-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy path5. topic.py
48 lines (40 loc) · 2.27 KB
/
5. topic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import nltk, re
from sherlock_holmes import bohemia_ch1, bohemia_ch2, bohemia_ch3, boscombe_ch1, boscombe_ch2, boscombe_ch3
from preprocessing import preprocess_text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# preparing the text
corpus = [bohemia_ch1, bohemia_ch2, bohemia_ch3, boscombe_ch1, boscombe_ch2, boscombe_ch3]
preprocessed_corpus = [preprocess_text(chapter) for chapter in corpus]
# Update stop_list:
stop_list = ["and", "or", "just", "am", "are", "please","sorry", "I", "you", "tey"]
# filtering topics for stop words
def filter_out_stop_words(corpus):
no_stops_corpus = ["say", "see", "holmes", "shall", "say", "man", "upon", "know", "quite", "one", "well", "could", "would", "take", "may", "think", "come", "go", "little", "must", "look"]
for chapter in corpus:
no_stops_chapter = " ".join([word for word in chapter.split(" ") if word not in stop_list])
no_stops_corpus.append(no_stops_chapter)
return no_stops_corpus
filtered_for_stops = filter_out_stop_words(preprocessed_corpus)
# creating the bag of words model
bag_of_words_creator = CountVectorizer()
bag_of_words = bag_of_words_creator.fit_transform(filtered_for_stops)
# creating the tf-idf model
tfidf_creator = TfidfVectorizer(min_df = 0.2)
tfidf = tfidf_creator.fit_transform(preprocessed_corpus)
# creating the bag of words LDA model
lda_bag_of_words_creator = LatentDirichletAllocation(learning_method='online', n_components=10)
lda_bag_of_words = lda_bag_of_words_creator.fit_transform(bag_of_words)
# creating the tf-idf LDA model
lda_tfidf_creator = LatentDirichletAllocation(learning_method='online', n_components=10)
lda_tfidf = lda_tfidf_creator.fit_transform(tfidf)
print("~~~ Topics found by bag of words LDA ~~~")
for topic_id, topic in enumerate(lda_bag_of_words_creator.components_):
message = "Topic #{}: ".format(topic_id + 1)
message += " ".join([bag_of_words_creator.get_feature_names()[i] for i in topic.argsort()[:-5 :-1]])
print(message)
print("\n\n~~~ Topics found by tf-idf LDA ~~~")
for topic_id, topic in enumerate(lda_tfidf_creator.components_):
message = "Topic #{}: ".format(topic_id + 1)
message += " ".join([tfidf_creator.get_feature_names()[i] for i in topic.argsort()[:-5 :-1]])
print(message)