-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
82 lines (66 loc) · 2.55 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
import seaborn as sns
import sqlite3
import warnings
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Import-time side effect: ask NLTK to fetch the 'vader_lexicon' resource.
# Presumably this is the lexicon SentimentIntensityAnalyzer (used in getScore)
# loads -- NOTE(review): confirm; this may hit the network on first import.
nltk.download('vader_lexicon')
# Import-time side effect: install a global filter that ignores every warning
# for the rest of the process (this also hides deprecation warnings).
warnings.filterwarnings('ignore')
def cleanTxt(text):
    """Strip tweet-specific noise from *text* and return the cleaned string.

    The following are removed, in this order:

    1. ``@mentions`` (letters, digits and underscores after the ``@``).
    2. Every ``#`` character (the hashtag word itself is kept).
    3. The retweet marker ``RT`` plus the whitespace that follows it.
    4. ``http://`` / ``https://`` links up to the next whitespace.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The text with the patterns above deleted. Surrounding whitespace is
        left untouched, so removed tokens may leave doubled spaces behind.
    """
    # The character class uses a plain ASCII hyphen for the 0-9 range; an
    # en-dash there would only match the literal characters '0', '–' and '9'
    # and leave most digits of a handle behind. '_' is included because it
    # is valid in handles.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # \b keeps words that merely end in "RT" (e.g. "SMART move") intact.
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    return text
def visualiseSentiment(df):
    """Draw an annotated heatmap of the values in *df*, one column per text.

    *df* is passed to ``pd.DataFrame``; the resulting frame is indexed by its
    ``"Text"`` column and transposed before plotting, so each text becomes a
    heatmap column. The figure is displayed with ``plt.show()``; nothing is
    returned.

    Args:
        df: Anything ``pd.DataFrame`` accepts that yields a ``"Text"`` column.
    """
    # Wide, short figure: applied through seaborn's global rc settings.
    sns.set(rc={'figure.figsize': (30, 2)})

    values_by_text = pd.DataFrame(df).set_index("Text").T
    # Diverging palette centred on 0, with each cell's value printed in it.
    sns.heatmap(values_by_text, center=0, annot=True, cmap="PiYG")
    plt.show()
# Lazily created analyzer shared by every getScore call.
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def getScore(text):
    """Return the VADER polarity scores for *text*.

    The text is first passed through ``cleanTxt`` and the cleaned string is
    scored with ``SentimentIntensityAnalyzer.polarity_scores``.

    Args:
        text: The raw text to score.

    Returns:
        The dictionary produced by ``polarity_scores`` (``vaderPolarity``
        reads its ``'compound'`` entry).
    """
    # Reuse a single analyzer: building a new one per call repeats its
    # set-up work, which adds up when this is applied to a whole column.
    return _get_vader_analyzer().polarity_scores(cleanTxt(text))
def vaderPolarity(score, threshold=0.05):
    """Map a score dictionary to a 'Positive' / 'Neutral' / 'Negative' label.

    Args:
        score: Mapping whose ``'compound'`` entry holds the numeric score.
        threshold: Half-width of the neutral band around zero.

    Returns:
        ``'Positive'`` when the compound score is at least ``threshold``,
        ``'Neutral'`` when it lies strictly between ``-threshold`` and
        ``threshold``, and ``'Negative'`` in every other case.
    """
    compound = score.get('compound')

    if compound >= threshold:
        return 'Positive'

    inside_neutral_band = -threshold < compound < threshold
    return 'Neutral' if inside_neutral_band else 'Negative'
def count_regex(pattern, text):
    """Return how many non-overlapping matches of *pattern* occur in *text*.

    Args:
        pattern: Regular expression (string or compiled pattern).
        text: The string to scan.

    Returns:
        The number of matches as an ``int`` (0 when there are none).
    """
    # Count lazily instead of materialising the list of matches.
    return sum(1 for _ in re.finditer(pattern, text))
def loadData():
    """Load the airline tweets, report word-count statistics, and return them.

    Reads ``tweet_id``, ``airline_sentiment`` and ``text`` from the ``Tweets``
    table of ``../data/USAirlinesTweets.sqlite`` (a relative path, so it is
    resolved against the current working directory), drops rows whose
    ``text`` is null and capitalises the sentiment labels.

    As a side effect, per-sentiment summary statistics of the number of words
    per tweet and the vocabulary size are printed, and a per-sentiment
    histogram of the word counts is displayed.

    Returns:
        pandas.DataFrame: The cleaned frame with the three selected columns.
        The ``count_words`` column is only added to an internal copy and is
        not part of the returned frame.
    """
    conn = sqlite3.connect('../data/USAirlinesTweets.sqlite')
    try:
        df = pd.read_sql_query(
            "SELECT tweet_id, airline_sentiment, text FROM Tweets", conn
        )
    finally:
        # Release the database handle even if the query fails.
        conn.close()

    df = df.dropna(subset=['text'])
    # The .str accessor leaves missing labels as missing instead of raising.
    df['airline_sentiment'] = df['airline_sentiment'].str.capitalize()

    # Work on a copy so the helper column never leaks into the returned frame.
    df_ = df.copy()
    df_['count_words'] = df_['text'].apply(lambda t: count_regex(r'\w+', t))

    def show_dist(df, col):
        """Print statistics of *col* per sentiment and plot its distribution."""
        print('\n Statistics \n')
        print(df.groupby('airline_sentiment')[col].describe())
        vocabulary = set(cleanTxt(" ".join(df.text.values)).split())
        print(f'Vocabulary Size {len(vocabulary)}')
        print('\n Distribution \n')
        # One bin edge per integer value between the column's min and max.
        bins = np.arange(df[col].min(), df[col].max() + 1)
        g = sns.FacetGrid(
            df,
            col='airline_sentiment',
            height=5,
            hue='airline_sentiment',
            palette="magma",
        )
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # kept as-is so the rendered plot does not change.
        g = g.map(sns.distplot, col, kde=True, norm_hist=True, bins=bins)
        plt.show()

    show_dist(df_, 'count_words')
    return df
def vaderNormScoreFunc():
    """Plot the curve ``y = x / sqrt(x**2 + 15)`` for x in [-5, 5].

    The curve is sampled at 100 evenly spaced points and drawn in blue on a
    10x5 inch figure whose left and bottom spines are moved to the centre of
    the axes, with the right and top spines hidden. The function does not
    call ``plt.show()`` and returns nothing.
    """
    xs = np.linspace(-5, 5, 100)
    ys = xs / np.sqrt(np.square(xs) + 15)

    figure = plt.figure(figsize=(10, 5))
    axes = figure.add_subplot(1, 1, 1)

    # Cross the axes in the middle of the plot instead of at its edges.
    for side in ('left', 'bottom'):
        axes.spines[side].set_position('center')
    # Hide the two remaining frame lines.
    for side in ('right', 'top'):
        axes.spines[side].set_color('none')

    axes.xaxis.set_ticks_position('bottom')
    axes.yaxis.set_ticks_position('left')

    plt.plot(xs, ys, 'b')