-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathemotions.py
91 lines (71 loc) · 3.05 KB
/
emotions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from tqdm.notebook import tqdm
from nltk import word_tokenize
from nltk import tokenize
from nltk.corpus import stopwords
def text_emotion(df, column, lexicon_path="NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"):
    """Add one emotion-score column per lexicon emotion to a copy of ``df``.

    Every text in ``df[column]`` is tokenized; each token is lower-cased and
    stemmed, then looked up in the word-level emotion lexicon. For every
    token found, the lexicon's association values are added to that row's
    totals, so each new column holds the summed association for one emotion.

    Note that the lookup key is the *stemmed* token, so only lexicon entries
    that are spelled exactly like a stem can match.

    Args:
        df: DataFrame containing the texts to score. It is not modified.
        column: Name of the column in ``df`` that holds the text.
        lexicon_path: Path to the tab-separated lexicon file with the
            columns word, emotion, association (no header row). Defaults to
            the NRC lexicon file name used by the original script.

    Returns:
        A new DataFrame: a copy of ``df`` with one extra column per emotion
        present in the lexicon.
    """
    emolex_df = pd.read_csv(lexicon_path,
                            names=["word", "emotion", "association"],
                            sep='\t')
    # One row per word, one column per emotion.
    emolex_words = emolex_df.pivot(index='word',
                                   columns='emotion',
                                   values='association').reset_index()
    emotions = emolex_words.columns.drop('word')

    # word -> list of association values (ordered like `emotions`). A dict
    # lookup replaces a full scan of the lexicon frame for every token.
    word_scores = dict(zip(emolex_words['word'],
                           emolex_words[emotions].values.tolist()))
    stemmer = SnowballStemmer("english")

    # Totals are collected positionally (one list per row) so the result
    # does not depend on the index labels of `df` being unique.
    totals_per_row = []
    for label, text in tqdm(zip(df.index, df[column]), total=len(df)):
        totals = [0] * len(emotions)
        if isinstance(text, str):
            for token in word_tokenize(text):
                scores = word_scores.get(stemmer.stem(token.lower()))
                if scores is not None:
                    totals = [t + s for t, s in zip(totals, scores)]
        else:
            # Missing / non-text cells cannot be tokenized: report them and
            # leave their scores at zero instead of hiding the problem
            # behind a bare `except`.
            print("Error at: ", label, text)
        totals_per_row.append(totals)

    emo_df = pd.DataFrame(totals_per_row, index=df.index, columns=emotions)
    return pd.concat([df.copy(), emo_df], axis=1)
def number_of_words(sentence):
    """Count the tokens of ``sentence`` that are not stop words.

    The sentence is tokenized with NLTK and every token is compared,
    lower-cased, against the module-level ``stop_words`` set. The comparison
    is case-insensitive because that set is built from NLTK's lower-case
    stop-word list, so "The" must count as a stop word just like "the".

    Args:
        sentence: The text to analyse.

    Returns:
        The number of non-stop-word tokens, or ``None`` when ``sentence`` is
        not a string (e.g. a missing value read from a CSV file).
    """
    if not isinstance(sentence, str):
        return None
    tokens = tokenize.word_tokenize(sentence)
    return sum(1 for token in tokens if token.lower() not in stop_words)
# Stop-word set read by number_of_words(); it must remain a module-level name.
stop_words = set(stopwords.words('english'))

# Load only the columns the pipeline needs, all as plain strings.
df = pd.read_csv("tweets_sentiment_score.csv",
                 usecols=["Author", "Date", "Tweet_English"],
                 dtype={'Author': object, 'Date': object, 'Tweet_English': object})
# For a quick trial run, uncomment to process only the first 50 tweets:
# df = df[0:50]

df_emotions = text_emotion(df, "Tweet_English")
# number_of_words() may return None; to_numeric turns those into NaN so the
# column is always numeric.
df_emotions['word_count'] = pd.to_numeric(df['Tweet_English'].apply(number_of_words))

emotions = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'negative',
            'positive', 'sadness', 'surprise', 'trust']
# Normalise each emotion total by the tweet's non-stop-word count. A count of
# zero is treated as missing so the division yields NaN (removed by the
# dropna below) instead of writing `inf` into the output file.
safe_word_count = df_emotions['word_count'].where(df_emotions['word_count'] != 0)
for emotion in emotions:
    df_emotions[emotion] = df_emotions[emotion] / safe_word_count

# Keep one row per Date, then one row per tweet text.
df_unique = df_emotions.drop_duplicates(['Date'])
df_unique = df_unique.drop_duplicates(['Tweet_English'])
# TODO: remove this author filter in the future.
df_unique = df_unique[df_unique.Author != 'gabrielrufian']
# Drop every row that still has a missing value in any column.
df_unique = df_unique.dropna(axis='index')

df_unique.to_csv('tweets_emotions_score.csv', index=False)
print(df_unique)
print("Cé finit!")