-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathget_data.py
executable file
·158 lines (116 loc) · 4.16 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# -*- coding: utf-8 -*-
import ast
import re

import thulac
def parser_and_return_data_list(filepath):
    """Parse a one-record-per-line data file into parallel lists.

    Each line of the file must be a Python dict literal with the keys
    "summarization" and "article".

    Returns:
        (list_summarization, list_article) - the two field lists, in
        file order.
    """
    list_summarization = []
    list_article = []
    # 'with' guarantees the handle is closed (the original leaked it),
    # and iterating the file directly avoids readlines()'s full buffer.
    with open(filepath) as f:
        for line in f:
            # ast.literal_eval only accepts literals; eval() would execute
            # arbitrary code embedded in the data file.
            temp_dic = ast.literal_eval(line)
            list_summarization.append(temp_dic["summarization"])
            list_article.append(temp_dic["article"])
    return list_summarization, list_article
def get_clean_data_list(filepath):
    """Load the raw records from *filepath* and clean both field lists.

    Cleaning keeps CJK ideographs, ASCII alphanumerics and a small set of
    Chinese punctuation marks; every other character is dropped.  Articles
    additionally have the literal "<Paragraph>" tag removed.
    NOTE(review): relies on Python 2 str.decode - the inputs are assumed
    to be utf-8 byte strings.
    """
    list_summarization, list_article = parser_and_return_data_list(filepath)

    def _remove_special_char(m):
        # Keep common Chinese punctuation; everything else is deleted.
        matched = m.group(0)
        return matched if matched in u',。!?;:“”《》' else ''

    cleaner = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z]')

    for idx in range(len(list_summarization)):
        text = list_summarization[idx].decode("utf8")
        list_summarization[idx] = cleaner.sub(_remove_special_char, text)

    for idx in range(len(list_article)):
        text = list_article[idx].decode("utf8")
        text = re.sub(u'<Paragraph>', '', text)
        list_article[idx] = cleaner.sub(_remove_special_char, text)

    return list_summarization, list_article
def write_cut_word_to_file():
    """Segment the cleaned training articles (with and without summaries)
    using THULAC and write the space-joined tokens, one article per line,
    to ./data/cut_article.txt and ./data/cut_no_summ_article.txt.
    """
    filepath_summ = "./data/train_with_summ.txt"
    filepath_no_summ = "./data/train_without_summ.txt"
    _, list_article = get_clean_data_list(filepath_summ)              # unicode texts
    _, list_no_summ_article = get_clean_data_list(filepath_no_summ)   # unicode texts
    thu_cut = thulac.thulac("-seg_only")
    _segment_to_file(thu_cut, list_article, "./data/cut_article.txt")
    _segment_to_file(thu_cut, list_no_summ_article,
                     "./data/cut_no_summ_article.txt")


def _segment_to_file(thu_cut, articles, out_path):
    """Write THULAC-segmented *articles* to *out_path*, one per line.

    An article that fails to segment is logged by index and written as
    the sentinel line "wrong context" (same fallback as before).
    """
    f_out = open(out_path, "w+")
    try:
        for i, article in enumerate(articles):
            article = article.encode("utf8")
            # The original put thu_cut.cut() outside the try in one loop
            # and inside it in the other; segmentation itself can fail,
            # so it belongs inside the try consistently.
            try:
                content = " ".join(thu_cut.cut(article))
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt and
                # SystemExit still propagate.
                print(i)
                content = "wrong context"
            f_out.write(content + "\n")
    finally:
        f_out.close()
def write_test_cut_word_to_file():
    """Segment the cleaned evaluation articles with THULAC and write the
    space-joined tokens, one article per line, to
    ./data/test_data/cut_test.txt.
    """
    thu_cut = thulac.thulac("-seg_only")
    filepath_test_data = "./data/test_data/evaluation_without_ground_truth.txt"
    _, list_test_data = get_clean_data_list(filepath_test_data)
    # 'with' closes the output file even if segmentation raises; the
    # original leaked the handle on any exception.
    with open("./data/test_data/cut_test.txt", "w+") as f_test_data:
        for article in list_test_data:
            tokens = thu_cut.cut(article.encode("utf8"))
            f_test_data.write(" ".join(tokens) + "\n")
def get_all_cut_short_text():
    """Return the per-line token lists of all three segmented files
    (train, no-summary train, test), concatenated in that order.

    Tokens come from split(' '), so the last token of every line keeps
    its trailing newline - identical to the historical behavior.
    """
    paths = [
        "./data/cut_article.txt",
        "./data/cut_no_summ_article.txt",
        "./data/test_data/cut_test.txt",
    ]
    list_cut_short_text = []
    for path in paths:
        # 'with' fixes the original's three leaked file handles.
        with open(path, 'r') as f:
            for line in f:
                list_cut_short_text.append(line.split(' '))
    return list_cut_short_text
def division_train_and_test_data(split_index=45000):
    """Split ./data/cut_article.txt into train/test files by line index.

    Lines [0, split_index) go to ./data/cut_article_train.txt, the rest
    to ./data/cut_article_test.txt.

    Args:
        split_index: first line index routed to the test file; defaults
            to the original hard-coded 45000 cut-off.
    """
    # 'with' closes all three handles; the original called the
    # nonexistent f_article.cloee(), which raised AttributeError and
    # leaked the input handle.
    with open('./data/cut_article.txt', 'r') as f_article, \
         open("./data/cut_article_train.txt", 'w+') as f_new_train, \
         open("./data/cut_article_test.txt", 'w+') as f_new_test:
        for i, line in enumerate(f_article):
            if i < split_index:
                f_new_train.write(line)
            else:
                f_new_test.write(line)
def get_cut_data_list_list(filepath):
    """Read a segmented file and return one token list per line.

    Tokens come from split(" "), so the last token of every line keeps
    its trailing newline - identical to the historical behavior.
    """
    list_test_article = []
    # 'with' fixes the original's leaked file handle (it was also
    # misleadingly named 'list_article' despite being a file object).
    with open(filepath, 'r') as f_article:
        for line in f_article:
            list_test_article.append(line.split(" "))
    return list_test_article
def get_test_summary():
    """Return the reference summaries of the held-out split: every
    summarization from index 45000 onward of data/train_with_summ.txt."""
    summaries, _ = parser_and_return_data_list("data/train_with_summ.txt")
    return summaries[45000:]
if __name__ == '__main__':
    # Entry point: segment the evaluation set and write it to disk.
    # (Earlier pipeline steps - cleaning, training-set segmentation,
    # train/test division - are run by calling their functions directly.)
    write_test_cut_word_to_file()