You need to sign in or sign up before continuing.
Commit e1bebfc7 by Febby Simanjuntak

update

parent f27cbbb1
This source diff could not be displayed because it is too large. You can view the blob instead.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset and Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
" \n",
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
" \n",
"# this is a very toy example, do not try this at home unless you want to understand the usage differences\n",
"docs=[\"the\", \"to\", \"ect\", \"and\", \"for\", \"of\", \"a\", \"you\", \"hou\", \"in\", \"on\", \"is\", \"this\", \"enron\", \"i\", \"be\", \"that\", \"will\",\n",
" \"have\", \"with\", \"your\",\"at\", \"we\", \"are\", \"it\", \"by\", \"com\", \"as\", \"from\", \"gas\", \"or\",\"not\", \"not\", \"me\", \"deal\", \"if\",\n",
" \"meter\",\"hpl\", \"please\",\"re\", \"e\", \"any\", \"our\", \"corp\",\"can\", \"d\", \"all\", \"has\", \"was\", \"know\", \"need\", \"an\", \"forwarded\", \n",
" \"new\", \"t\", \"may\", \"up\", \"j\",\"should\", \"do\", \"am\", \"out\", \"see\", \"no\", \"there\", \"price\", \"daren\", \"but\", \"been\", \"company\", \n",
" \"I\", \"these\", \"let\", \"so\", \"would\", \"m\", \"into\", \"xls\", \"farmer\", \"attached\", \"us\", \"information\", \"they\", \"message\", \n",
" \"day\", \"time\", \"my\", \"one\", \"what\", \"only\", \"http\", \"th\", \"volume\", \"mail\", \"contract\", \"which\", \"month\",\n",
" \"more\", \"robert\", \"sitara\", \"obout\", \"texas\", \"nom\", \"energy\", \"pec\", \"questions\", \"www\", \"deals\", \"volumes\", \"pm\", \"ena\",\n",
" \"now\", \"their\", \"file\", \"some\", \"email\", \"just\", \"also\", \"call\", \"change\", \"other\", \"here\", \"like\", \"b\", \"flow\", \"net\", \n",
" \"following\", \"p\", \"production\",\"when\", \"over\", \"back\", \"want\", \"original\", \"them\", \"below\", \"o\", \"ticket\", \"c\", \"he\",\n",
" \"could\", \"make\", \"inc\", \"report\", \"march\", \"contact\", \"were\", \"days\", \"list\", \"nomination\", \"system\", \"who\", \"april\", \n",
" \"number\", \"sale\", \"don\", \"its\", \"first\", \"thanks\", \"business\",\"help\", \"per\", \"through\", \"july\", \"forward\", \"font\", \"free\", \n",
" \"daily\", \"use\", \"order\", \"today\", \"r\", \"had\", \"fw\", \"set\", \"plant\", \"statements\", \"go\", \"gary\", \"oil\", \"line\", \"sales\", \n",
" \"w\", \"effective\", \"well\", \"tenaska\", \"take\",\"june\",\"x\", \"within\",\"nbsp\", \"she\", \"how\", \"north\", \"america\", \"being\", \n",
" \"under\", \"next\", \"week\", \"than\", \"january,\" \"la\"\n",
" ]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Initialize CountVectorizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#instantiate CountVectorizer()\n",
"cv=CountVectorizer()\n",
" \n",
"# this steps generates word counts for the words in your docs\n",
"word_count_vector=cv.fit_transform(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"word_count_vector.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute the IDF values"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)\n",
"tfidf_transformer.fit(word_count_vector)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# print idf values\n",
"df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=[\"idf_weights\"])\n",
" \n",
"# sort ascending\n",
"df_idf.sort_values(by=['idf_weights'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Compute the TFIDF score for your documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# count matrix\n",
"count_vector=cv.transform(docs)\n",
" \n",
"# tf-idf scores\n",
"tf_idf_vector=tfidf_transformer.transform(count_vector)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"feature_names = cv.get_feature_names()\n",
" \n",
"#get tfidf vector for first document\n",
"first_document_vector=tf_idf_vector[0]\n",
" \n",
"#print the scores\n",
"df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=[\"tfidf\"])\n",
"df.sort_values(by=[\"tfidf\"],ascending=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tfidfvectorizer Usage"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer \n",
" \n",
"# settings that you use for count vectorizer will go here\n",
"tfidf_vectorizer=TfidfVectorizer(use_idf=True)\n",
" \n",
"# just send in all your docs here\n",
"tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the first vector out (for the first document)\n",
"first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]\n",
" \n",
"# place tf-idf values in a pandas data frame\n",
"df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=[\"tfidf\"])\n",
"df.sort_values(by=[\"tfidf\"],ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tfidf_vectorizer=TfidfVectorizer(use_idf=True)\n",
" \n",
"# just send in all your docs here\n",
"fitted_vectorizer=tfidf_vectorizer.fit(docs)\n",
"tfidf_vectorizer_vectors=fitted_vectorizer.transform(docs)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment