-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathutils.py
247 lines (218 loc) · 9.74 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import numpy as np
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score, confusion_matrix, auc
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
from textblob import Word
from nltk.corpus import stopwords
import re
import nltk
import pandas as pd
import requests
#!/usr/bin/env python
import config
import matplotlib.pyplot as plt
def group_list(lst, size=100):
    """
    Split ``lst`` into consecutive batches and join each batch into a string.

    Parameters
    ----------
    lst : sequence
        Items (e.g. tweet ids) to batch. Each item is converted with ``str``.
    size : int, default 100
        Maximum number of items per batch. Must be positive.

    Returns
    -------
    list of str
        One comma-separated string per batch, in the original order.
        An empty ``lst`` yields an empty list.

    Raises
    ------
    ValueError
        If ``size`` is zero or negative (the batching could never advance).
    """
    if size <= 0:
        raise ValueError("size must be a positive integer, got {!r}".format(size))
    return [
        ','.join(str(item) for item in lst[start:start + size])
        for start in range(0, len(lst), size)
    ]
def tweets_request(tweets_ids):
    """
    Fetch tweets from the Twitter API v2 ``/2/tweets`` endpoint, batch by batch.

    Parameters
    ----------
    tweets_ids : iterable of str
        Each element is a comma-separated string of tweet ids, as produced by
        ``group_list``. One GET request is made per element.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the ``data`` payload of every response that contained
        one (per-batch row indices are kept as-is). Responses without a
        ``data`` key are skipped. If no response contained data, an empty
        DataFrame is returned instead of letting ``pd.concat`` raise.
    """
    # The headers do not depend on the batch, so build them once.
    headers = {'Authorization': 'Bearer ' + config.keys['bearer_token'],
               'Cookie': 'personalization_id="v1_hzpv7qXpjB6CteyAHDWYQQ=="; guest_id=v1%3A161498381400435837'}
    payload = {}
    frames = []
    for batch in tqdm(tweets_ids):
        url = "https://api.twitter.com/2/tweets?ids={}&tweet.fields=created_at&expansions=author_id&user.fields=created_at".format(batch)
        # Without a timeout a stalled connection would block the whole run.
        r = requests.request("GET", url, headers=headers, data=payload, timeout=30)
        data = r.json()
        if 'data' in data:
            frames.append(pd.DataFrame(data['data']))
    if not frames:
        # pd.concat([]) raises ValueError; give callers an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames)
def aps(X, y, model):
    """
    Return the average precision (PR AUC) score of ``model`` on ``(X, y)``.

    X is the feature matrix, y the true target values, and model an already
    instantiated estimator exposing ``predict_proba``. The score is computed
    from column 1 of ``model.predict_proba(X)``.
    """
    second_column_probs = model.predict_proba(X)[:, 1]
    pr_auc = average_precision_score(y, second_column_probs)
    return pr_auc
def get_metrics(X_tr, y_tr, X_val, y_val, y_pred_tr, y_pred_val, model):
    """
    Print training and validation F1, recall, precision and PR AUC scores.

    X_tr / y_tr and X_val / y_val are the training and validation datasets,
    y_pred_tr / y_pred_val are the corresponding ``model.predict(X)`` outputs,
    and model is the instantiated model (used for the average precision score).
    Nothing is returned; the eight scores are written to stdout.
    """
    # All scores are evaluated (in this order) before anything is printed.
    report = [
        ('Training F1 Score: ', f1_score(y_tr, y_pred_tr)),
        ('Validation F1 Score: ', f1_score(y_val, y_pred_val)),
        ('Training Recall Score: ', recall_score(y_tr, y_pred_tr)),
        ('Validation Recall Score: ', recall_score(y_val, y_pred_val)),
        ('Training Precision Score: ', precision_score(y_tr, y_pred_tr)),
        ('Validation Precision Score: ', precision_score(y_val, y_pred_val)),
        ('Training Average Precision Score: ', aps(X_tr, y_tr, model)),
        ('Validation Average Precision Score: ', aps(X_val, y_val, model)),
    ]
    for label, value in report:
        print(label, value)
def conf_matrix(y_test, y_hat_test):
    """
    Draw a seaborn heatmap of the 2x2 confusion matrix of the given labels.

    Each cell is annotated with its group name (TN, FP, FN, TP), its raw
    count, and its share of all observations as a percentage.
    """
    cnf = confusion_matrix(y_test, y_hat_test)
    cell_counts = cnf.flatten()
    cell_shares = cell_counts / np.sum(cnf)
    cell_text = [
        f'{name}\n{count:0.0f}\n{share:.2%}'
        for name, count, share in zip(('TN', 'FP', 'FN', 'TP'), cell_counts, cell_shares)
    ]
    # The annotation grid is reshaped to 2x2, so a binary matrix is expected.
    annotations = np.asarray(cell_text).reshape(2, 2)
    sns.heatmap(cnf, annot=annotations, fmt='', cmap='Blues', annot_kws={'size': 16})
# Courtesy of DTrimarchi10 on GitHub
def make_confusion_matrix(cf, X, y, model,
                          group_names=('TN', 'FP', 'FN', 'TP'),
                          categories='auto',
                          count=True,
                          percent=True,
                          cbar=True,
                          xyticks=True,
                          xyplotlabels=True,
                          sum_stats=True,
                          figsize=None,
                          cmap='Blues',
                          title=None):
    '''
    This function will make a pretty plot of an sklearn Confusion Matrix cm using a Seaborn heatmap visualization.
    Arguments
    ---------
    cf:            confusion matrix to be passed in (numpy array)
    X, y, model:   feature values, target values and fitted model; passed to aps()
                   to compute the PR AUC score shown in the summary statistics.
                   Only used when sum_stats is True and cf is a 2x2 matrix.
    group_names:   Sequence of strings that represent the labels row by row to be shown in each square.
                   Ignored unless its length equals the number of cells in cf.
    categories:    List of strings containing the categories to be displayed on the x,y axis. Default is 'auto'
    count:         If True, show the raw number in the confusion matrix. Default is True.
    percent:       If True, show each cell as a proportion of all observations. Default is True.
    cbar:          If True, show the color bar. The cbar values are based off the values in the confusion matrix.
                   Default is True.
    xyticks:       If True, show x and y ticks. Default is True.
    xyplotlabels:  If True, show 'True label' and 'Predicted label' on the figure. Default is True.
    sum_stats:     If True, display summary statistics below the figure. Default is True.
    figsize:       Tuple representing the figure size. Default will be the matplotlib rcParams value.
    cmap:          Colormap of the values displayed from matplotlib.pyplot.cm. Default is 'Blues'
                   See http://matplotlib.org/examples/color/colormaps_reference.html
    title:         Title for the heatmap. Default is None.
    '''
    # CODE TO GENERATE TEXT INSIDE EACH SQUARE
    blanks = [''] * cf.size
    flat = cf.flatten()

    if group_names and len(group_names) == cf.size:
        group_labels = ["{}\n".format(value) for value in group_names]
    else:
        group_labels = blanks

    if count:
        group_counts = ["{0:0.0f}\n".format(value) for value in flat]
    else:
        group_counts = blanks

    if percent:
        group_percentages = ["{0:.2%}".format(value) for value in flat / np.sum(cf)]
    else:
        group_percentages = blanks

    box_labels = [f"{v1}{v2}{v3}".strip() for v1, v2, v3 in zip(group_labels, group_counts, group_percentages)]
    box_labels = np.asarray(box_labels).reshape(cf.shape[0], cf.shape[1])

    # CODE TO GENERATE SUMMARY STATISTICS & TEXT FOR SUMMARY STATS
    if sum_stats:
        if len(cf) == 2:
            # Metrics for binary confusion matrices
            precision = cf[1, 1] / sum(cf[:, 1])
            recall = cf[1, 1] / sum(cf[1, :])
            # Named `f1` so the local does not shadow sklearn's imported f1_score.
            f1 = 2 * precision * recall / (precision + recall)
            pr_auc = aps(X, y, model)
            stats_text = "\n\nPrecision={:0.3f}\nRecall={:0.3f}\nF1 Score={:0.3f}\nPR AUC Score={:0.3f}".format(
                precision, recall, f1, pr_auc)
        else:
            # Accuracy is sum of diagonal divided by total observations
            accuracy = np.trace(cf) / float(np.sum(cf))
            stats_text = "\n\nAccuracy={:0.3f}".format(accuracy)
    else:
        stats_text = ""

    # SET FIGURE PARAMETERS ACCORDING TO OTHER ARGUMENTS
    if figsize is None:
        # Get default figure size if not set
        figsize = plt.rcParams.get('figure.figsize')

    if not xyticks:
        # Do not show categories if xyticks is False
        categories = False

    # MAKE THE HEATMAP VISUALIZATION
    plt.figure(figsize=figsize)
    sns.heatmap(cf, annot=box_labels, fmt="", cmap=cmap, cbar=cbar, xticklabels=categories, yticklabels=categories)

    if xyplotlabels:
        plt.ylabel('True label')
        plt.xlabel('Predicted label' + stats_text)
    else:
        plt.xlabel(stats_text)

    if title:
        plt.title(title)
def num_of_words(df, col):
    """
    Add a ``word_ct`` column with the number of words in ``df[col]`` and print it.

    Each value is converted with ``str`` and split on runs of whitespace, so
    repeated, leading or trailing spaces do not inflate the count and an empty
    string counts as zero words. ``df`` is modified in place; nothing is
    returned.
    """
    # split() with no argument ignores empty fields, unlike split(" ").
    df['word_ct'] = df[col].apply(lambda text: len(str(text).split()))
    print(df[[col, 'word_ct']])
def num_of_chars(df, col):
    """
    Add a ``char_ct`` column holding the string length of each ``df[col]``
    value, then print ``col`` next to the new column.

    ``df`` is modified in place; nothing is returned.
    """
    lengths = df[col].str.len()
    df['char_ct'] = lengths
    print(df[[col, 'char_ct']])
def avg_word(sentence):
    """
    Return the mean length of the whitespace-separated words in ``sentence``.

    Returns 0.0 when the sentence contains no words (empty or whitespace-only
    string) instead of raising ``ZeroDivisionError``.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)
def avg_word_length(df, col):
    """
    Add an ``avg_wrd`` column with ``avg_word`` applied to every ``df[col]``
    value, then print the first rows of ``col`` and the new column.

    ``df`` is modified in place; nothing is returned.
    """
    df['avg_wrd'] = df[col].apply(avg_word)
    preview = df[[col, 'avg_wrd']].head()
    print(preview)
def hash_tags(df, col):
    """
    Add a ``hashtags`` column counting the ``#`` characters in each
    ``df[col]`` value (after ``str`` conversion), then print the first rows.

    ``df`` is modified in place; nothing is returned.
    """
    # Splitting on '#' and subtracting one equals counting the '#' characters.
    df['hashtags'] = df[col].apply(lambda text: str(text).count('#'))
    preview = df[[col, 'hashtags']].head()
    print(preview)
# Substitutions applied, in this order, before lowercasing. Compiled once at
# import time so they are not re-resolved for every row.
_TWEET_SUBSTITUTIONS = (
    (re.compile(r'@[\S]+'), ' '),              # callouts (@user)
    (re.compile(r'&[\S]+?;'), ' '),            # character references (&amp; ...)
    (re.compile(r'#'), ' '),                   # the '#' of hashtags
    (re.compile(r'(\bRT\b|\bQT\b)'), ' '),     # Twitter codes RT and QT
    (re.compile(r'http[\S]+'), ' '),           # URL links
    (re.compile(r'[^\w\s]'), r''),             # punctuation / non-word symbols
)
# Any token that contains a digit.
_WORD_WITH_DIGIT = re.compile(r'\w*\d\w*')


def _clean_tweet(text):
    """Return the cleaned, lowercased form of a single tweet."""
    text = str(text)
    for pattern, replacement in _TWEET_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    # Collapse whitespace and lowercase.
    text = ' '.join(text.split()).lower()
    text = _WORD_WITH_DIGIT.sub(' ', text)
    # Removing digit tokens can leave doubled or leading/trailing spaces;
    # re-joining collapses the former and strips the latter.
    return ' '.join(text.split())


def preprocess_tweet(df, col):
    """
    Clean the tweets in ``df[col]`` in place.

    Remove callouts, character references (HTML characters, emojis), # in hashtags,
    Twitter codes RT and QT, URL links, punctuation and any word containing a digit.
    Lowercase all words, collapse excess whitespace between words and remove
    leading and trailing whitespace.

    Every value is converted with ``str`` first. ``df`` is modified in place;
    nothing is returned.
    """
    df[col] = df[col].apply(_clean_tweet)
def tokenize(df, col):
    """
    Tokenize a whole column of strings at once.

    All values of ``df[col]`` are joined with single spaces into one string,
    which is passed to ``nltk.word_tokenize``. The strings are expected to be
    free of punctuation (spaces only).

    Returns the list of tokens.
    """
    joined_text = ' '.join(df[col].to_list())
    return nltk.word_tokenize(joined_text)
# `stopwords` is already imported at the top of the file; this repeats it.
from nltk.corpus import stopwords
# English stop words, built once at import time as a set for fast membership
# tests in no_stopwords().
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))
def no_stopwords(text):
    """
    Return a list of the items of ``text`` that are not in ``stop_words``.

    ``text`` is iterated item by item (e.g. a list of tokens); the original
    order is kept.
    """
    return [token for token in text if token not in stop_words]
def term_frequency(df):
    """
    Count how often each word occurs across all values of ``df['tweet']``.

    Each tweet is split on single spaces, counted on its own, and the per-tweet
    counts are summed per word.

    Returns
    -------
    pandas.DataFrame
        Sorted by descending frequency, with columns ``index`` (the word's row
        position before sorting), ``words`` and ``tf`` (total count).
    """
    # Series.value_counts replaces the top-level pd.value_counts, which is
    # deprecated since pandas 2.1.
    per_tweet_counts = df['tweet'].apply(
        lambda text: pd.Series(text.split(" ")).value_counts()
    )
    tf1 = per_tweet_counts.sum(axis=0).reset_index()
    tf1.columns = ['words', 'tf']
    tf1 = tf1.sort_values(by='tf', ascending=False).reset_index()
    return tf1
def stemming(token_list):
    """
    Return a list with the Porter stem of every token in ``token_list``,
    in the same order.
    """
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, token_list))
def lemmatization(df):
    """
    Add a ``lem`` column holding each ``df['tweet']`` value with every
    whitespace-separated word replaced by ``Word(word).lemmatize()``.

    ``df`` is modified in place. Returns the first rows (``head()``) of the
    new ``lem`` column.
    """
    def _lemmatize_text(text):
        return " ".join([Word(token).lemmatize() for token in text.split()])

    df['lem'] = df['tweet'].apply(_lemmatize_text)
    return df['lem'].head()