-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathex1_csv2txt.py
80 lines (72 loc) · 1.98 KB
/
ex1_csv2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# encoding=utf-8
import math
from collections import defaultdict
from nltk.corpus import stopwords
import nltk
import pandas as pd
import pickle
import config
# --- configuration & shared state -------------------------------------------
maxlen = config.MAX_TEXT_LENGTH  # max number of words to keep per comment
porter = nltk.PorterStemmer()    # only used by the commented-out stemming option below
wnl = nltk.WordNetLemmatizer()
toxic_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
word_dict = {}  # word -> integer id, shared across the train and test passes
i = 0           # next free id to assign in word_dict
badwords = config.badwords      # presumably maps obfuscated bad words -> canonical form; verify in config
stop_words = config.stop_words
print(len(badwords))
N = 0
all_word = 0
all_class = 0

# --- pass 1: write the cleaned training corpus, one comment per line --------
train = pd.read_csv('data/train_valid.csv')
# `with` guarantees the file is closed even if tokenization raises
with open("./train_corps.txt", 'w', encoding='utf-8') as outf:
    for index, row in train.iterrows():
        line = row['comment_text']
        tokens = nltk.word_tokenize(line)
        word_count = 0  # renamed from `len`, which shadowed the builtin
        for t in tokens:
            # NOTE(review): unlike the test pass, tokens here are NOT
            # lemmatized (the calls are deliberately commented out) —
            # confirm the asymmetry is intended.
            word = t
            # word = wnl.lemmatize(t)
            # word = porter.stem(word)
            word = word.lower().replace('\t', '').replace('\n', '')
            if word in stop_words:
                continue
            if word in badwords:
                word = badwords[word]  # normalize to the canonical form
            if word not in word_dict:
                word_dict[word] = i
                i += 1
            outf.write(word)
            outf.write(' ')
            word_count += 1
            # was `> maxlen`, which kept maxlen + 1 words per comment
            if word_count >= maxlen:
                break
        outf.write("\n")
# --- pass 2: write the cleaned test corpus, one comment per line ------------
# NOTE(review): unlike the training pass, this pass lemmatizes each token and
# applies no maxlen truncation — confirm whether that asymmetry is intended.
test_df = pd.read_csv("./data/test.csv")  # renamed from `train`, which shadowed the training frame
test_comments = test_df["comment_text"].fillna("MISSINGVALUE").values
# `with` guarantees the file is closed even if tokenization raises
with open("./test_corps.txt", 'w', encoding='utf-8') as outf:
    for line in test_comments:
        tokens = nltk.word_tokenize(line)
        for t in tokens:
            word = wnl.lemmatize(t)
            # word = porter.stem(word)
            word = word.lower().replace('\t', '').replace('\n', '')
            if word in stop_words:
                continue
            if word in badwords:
                word = badwords[word]  # normalize to the canonical form
            # keep growing the shared train/test vocabulary
            if word not in word_dict:
                word_dict[word] = i
                i += 1
            outf.write(word)
            outf.write(' ')
        outf.write("\n")
#
# with open('word_dict.pkl3', 'wb') as f:
# pickle.dump(word_dict, f)