-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcord19_eda.py
98 lines (80 loc) · 2.97 KB
/
cord19_eda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
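'''
cord19_eda.py: Exploratory data analysis of COVID-19 transmission-related articles.

Reads the processed abstracts from data/cord19_transmission_processed.csv, saves a
word cloud and top-20 unigram/bigram/trigram bar charts under output/, and writes
the 1- to 6-gram frequency tables to output/ngrams.txt.
'''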
import matplotlib.pyplot as plt
import pandas as pd
import utils_text as utils_text
import utils_vis as utils_vis
from wordcloud import WordCloud
# Console/text rendering limits for DataFrames. These also govern how the
# n-gram tables are rendered when they are formatted into the text report.
# (A previous `pd.options.display.max_rows = 200` assignment was dropped: it
# was overwritten by the 300 below before anything was displayed.)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', 80)
# Load the processed abstracts; the first CSV row holds the column names.
data = 'data/cord19_transmission_processed.csv'
df = pd.read_csv(data, header=0, sep=',')

# Build one text blob from every processed abstract for the word cloud.
# pandas reads empty CSV cells back as NaN (a float), which would make
# str.join raise TypeError, so missing abstracts are skipped here. Only the
# word-cloud input is filtered; `df` itself is left unchanged.
abstract_text = ' '.join(df['abstract_processed'].dropna().astype(str))

wc_fig = plt.figure(figsize=(20, 12))
wc = WordCloud(height=2000, width=2000, background_color='lightgrey', colormap='copper_r')
wc = wc.generate(abstract_text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')  # a word cloud has no meaningful axes
plt.savefig('output/wordcloud.png')
# Release the large figure once it is on disk so it does not stay alive
# (and remain the "current" figure) for the rest of the script.
plt.close(wc_fig)
def _write_ngram_table(out, frame, heading, n, rows):
    """Compute the n-gram table for `frame` and append it to the report.

    Args:
        out: Writable text file object receiving the report.
        frame: DataFrame holding the 'abstract_processed' column.
        heading: Text printed on the line above the table (may start with a
            newline to separate it from the previous table).
        n: N-gram size passed to utils_text.get_ngrams_df.
        rows: Number of leading rows of the table to write.

    Returns:
        The full table returned by utils_text.get_ngrams_df.
    """
    table = utils_text.get_ngrams_df(frame, 'abstract_processed', n)
    print('{}\n{}'.format(heading, table.head(n=rows)), file=out)
    return table


def _plot_top_ngrams(table, label, out_path, rotation, bottom=None, top_n=20):
    """Save a bar chart of the first `top_n` rows of an n-gram table.

    Args:
        table: DataFrame with 'N-Gram' and 'Frequency' columns. Rows are
            plotted in the order given (presumably already sorted by
            descending frequency by utils_text.get_ngrams_df -- confirm there).
        label: Singular n-gram name ('Unigram', 'Bigram', ...) used for the
            x-axis label and, pluralised, in the title.
        out_path: Image file the chart is saved to.
        rotation: Rotation in degrees of the x tick labels.
        bottom: Optional bottom margin for the figure, to leave room for
            long rotated tick labels.
        top_n: Number of leading rows to plot.
    """
    fig = plt.figure(figsize=(20, 12))
    plt.bar(table['N-Gram'][:top_n], table['Frequency'][:top_n], color='sienna')
    plt.xlabel(label, size=14)
    plt.xticks(rotation=rotation)
    plt.ylabel('Frequency', size=14)
    plt.title('Top {} {}s Related to COVID-19 Transmission'.format(top_n, label), size=18)
    utils_vis.add_bar_value_labels(plt.gca())
    if bottom is not None:
        fig.subplots_adjust(bottom=bottom)
    plt.savefig(out_path)
    # Close the figure once saved so figures do not accumulate in memory.
    plt.close(fig)


# The context manager guarantees the report is flushed and closed even if an
# n-gram computation or a plot fails part-way; the explicit encoding keeps the
# output independent of the platform's default locale.
with open('output/ngrams.txt', 'w', encoding='utf-8') as ngrams:
    # Unigrams
    unigrams = _write_ngram_table(ngrams, df, 'Unigrams', 1, rows=300)
    _plot_top_ngrams(unigrams, 'Unigram', 'output/top20_unigrams.png', rotation=15)

    # Bigrams
    bigrams = _write_ngram_table(ngrams, df, '\nBigrams', 2, rows=300)
    _plot_top_ngrams(bigrams, 'Bigram', 'output/top20_bigrams.png', rotation=30)

    # Trigrams (extra bottom margin so the long rotated labels are not clipped)
    trigrams = _write_ngram_table(ngrams, df, '\nTrigrams', 3, rows=300)
    _plot_top_ngrams(trigrams, 'Trigram', 'output/top20_trigrams.png', rotation=30, bottom=0.15)

    # Longer n-grams are only written to the text report, not plotted.
    # 4-grams
    four_grams = _write_ngram_table(ngrams, df, '\n4-grams', 4, rows=50)
    # 5-grams
    five_grams = _write_ngram_table(ngrams, df, '\n5-grams', 5, rows=50)
    # 6-grams
    six_grams = _write_ngram_table(ngrams, df, '\n6-grams', 6, rows=50)