-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdetox_engine.py
167 lines (131 loc) · 6.53 KB
/
detox_engine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
'''
Author : Wonhee Jung ( wonheej2@illinois.edu, wonhee.jung@gmail.com )
Since : Nov, 2018
UIUC MCS-DS CS410 Fall 2018 Project.
'''
import csv
import gc
import os
import os.path
import sys
import time
from builtins import range
import nltk
import numpy as np
import pandas as pd
from joblib import dump, load
from nltk import word_tokenize,sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
# includes all the defined constant variables.
import constant
os.environ['OMP_NUM_THREADS'] = '2'
# The class will create two joblib files to store the trained classifier and vectorizer so that it can be reused quickly
# instead of training it from the beginning.
#
# !!!! IMPORTANT !!!
#
# Which means if you ever chagne any code in __init__ that could affect the way how classifier/vectorizer is going to work,
# then you need to delete two files so that program recrete them. Those two files you need to delete are "classifier.joblib", and "vectorizer.joblib"
# defined in constant.py.
class ToxicityClassifier():
def __init__(self):
start_time = time.time()
self.stopwords = set(w.rstrip() for w in open('stopwords.txt'))
self.vectorizer = TfidfVectorizer(tokenizer=ToxicityClassifier.tokenizer, max_features=10000, stop_words=self.stopwords, analyzer='word', dtype=np.float32)
# creating the Multinomial Naive Bayes with Laplacian smoothing.
self.classifier = MultinomialNB()
# if and only if model doesn't exist in the file, execute this block, means you need to delete the existing model file to re-run this.
if os.path.exists(constant.CLASSIFIER_FILE) == False:
print("Can't find existing classifier stored in the file. Creating one...")
# for df in pd.read_csv(constant.TRAINING_DATA_PATH, delimiter=',', error_bad_lines=True, skipinitialspace=True, chunksize=constant.CVS_CHUNKSIZE, iterator=True):
dtypes = {
'toxic' : 'uint8',
'severe_toxic' : 'uint8',
'obscene' : 'uint8',
'threat' : 'uint8',
'insult' : 'uint8',
'identity_hate' : 'uint8'
}
df = pd.read_csv(constant.TRAINING_DATA_PATH, dtype=dtypes)
df = df.replace('\n','', regex=True)
toxic = \
df['toxic'] | \
df['severe_toxic'] | \
df['obscene'] | \
df['threat'] | \
df['insult'] | \
df['identity_hate']
training = self.vectorizer.fit_transform(df['comment_text'])
del df
gc.collect()
print("Initiating training...")
train_x, test_x, train_y, test_y = train_test_split(training.toarray(), toxic.values, test_size=0.2)
del training
gc.collect()
self.classifier.fit(train_x, train_y)
del train_x, train_y
gc.collect()
print("Completed training. Generating classification result...")
pred_y = self.classifier.predict(test_x)
del test_x
gc.collect()
print(classification_report(test_y, pred_y))
# store the classifier and vectorizer so it can be used later
print("Storing classifier and vectorizer into disk...")
dump(self.classifier, constant.CLASSIFIER_FILE)
dump(self.vectorizer, constant.VECTORIZER_FILE)
else:
print("Found existing classifier and vectorizer stored in the file. Loading...")
self.classifier = load(constant.CLASSIFIER_FILE)
self.vectorizer = load(constant.VECTORIZER_FILE)
# to measure programing execution time
print("--- time spent for initializing the classifier : %s seconds ---" % (time.time() - start_time))
# get string value as paraemter and return tokenized array after some data massage, cleansing, etc.
# returned token array won't have any word with length of one or two
@staticmethod
def tokenizer(s):
wordnet_lemmatizer = WordNetLemmatizer()
s = s.lower()
tokens = nltk.tokenize.word_tokenize(s)
tokens = [token for token in tokens if len(token) > 2] # remove words with two characters or less, those are probably pronoun and
tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens] # change the word to base form with NLTK's WordNetLematizer
return tokens
# with given parameter s, it returns whether s is toxic or not
# it is not expecting any arrays, it should be just single string value
def isToxic(self, s):
pred = self.classifier.predict( self.vectorizer.transform( np.array([s])).toarray() )
if pred[0] == 1:
return True
else:
return False
# main function if you need to run it separated, not through chatbot.py.
# The function will load local test CSV file and execute the prediction, instead of getting messages from TwitchTV channel it has deployed.
# With "test/test.csv", the code is going to take anothe 300MB +- to load all the text data and process it.
def main():
print("Initiating...")
# below file has smaller set of test data. Enable it instead if you want quicker testing
test_data_path = "data/test_sample.csv"
# below file has full set of test data. Try with it if you see more dresult. Beware : it will take some time.
#test_data_path = "data/test.csv"
toxicClassifier = ToxicityClassifier()
# loading chat logs from csv file
df = pd.read_csv(test_data_path, delimiter=',', error_bad_lines=True, skipinitialspace=True)
df = df.replace('\n','', regex=True)
# transform test data's chat log to the existing vecorizer so it can be used for prediction
data = toxicClassifier.vectorizer.transform( df['comment_text'] )
preds = toxicClassifier.classifier.predict( data )
# print(pd.DataFrame(preds, columns=toxicClassifier.classifier.classes_))
for i in range(len(preds)):
if preds[i] == 1:
print("TOXIC>>> " + df['comment_text'].values[i])
# just in case if need to run the test locally without TwitchBox working together
if __name__ == "__main__":
if not sys.version_info[:1] == (3,):
print(sys.version_info[:1] )
sys.stderr.write("Python version 3 is required.\n")
exit(1)
main()