-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy path1. what.py
27 lines (21 loc) · 886 Bytes
/
1. what.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import codecademylib3_seaborn
from preprocessing import preprocess_text
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# sample documents
document_1 = "This is a sample piece of text!"
document_2 = "This is my second sentence of text."
document_3 = "Is this my third sentence of text?"
# corpus of documents
corpus = [document_1, document_2, document_3]
# preprocess documents
processed_corpus = [preprocess_text(doc) for doc in corpus]
# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tf_idf_scores = vectorizer.fit_transform(processed_corpus)
# get vocabulary of terms
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in processed_corpus]
# create pandas DataFrame with tf-idf scores
df_tf_idf = pd.DataFrame(tf_idf_scores.T.todense(), index=feature_names, columns=corpus_index)
print(df_tf_idf)