extract_keywords_and_cal_summary.py
# -*- coding: utf-8 -*-
from gensim import models, corpora
from textrank4zh import TextRank4Keyword  # imported but unused in this script
import get_data
import re
import heapq
def get_max_k_tuple_list(tuple_list, k):
    """Return the k (word_id, weight) tuples with the largest weights."""
    return heapq.nlargest(k, tuple_list, key=lambda x: x[1])
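
# For example, with hypothetical (word_id, weight) pairs:
#   get_max_k_tuple_list([(0, 0.1), (1, 0.5), (2, 0.3)], 2) -> [(1, 0.5), (2, 0.3)]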
def get_stopwords_list():
    """Load the stopword list, one word per line."""
    filepath = "./stopword.txt"
    stopwords_list = []
    with open(filepath, encoding="utf8") as f:
        for line in f:
            # Strip the trailing newline so stopwords compare equal to dictionary tokens.
            stopwords_list.append(line.strip())
    return stopwords_list
def get_index_of_summary(dic, model_tfidf, corpus_list, k, list_word):
    """Build a summary of at most 60 characters from the clauses richest in TF-IDF keywords."""
    stopwords_list = get_stopwords_list()
    # Drop stopwords and purely numeric tokens before ranking by TF-IDF weight.
    corpus_list = [word_tuple for word_tuple in corpus_list
                   if dic.get(word_tuple[0]) not in stopwords_list
                   and not re.match(r"^\d*$", dic.get(word_tuple[0]))]
    # Keep the top len(list_word) // k words by TF-IDF weight as the keyword set.
    k = len(list_word) // k
    list_max_k = get_max_k_tuple_list(model_tfidf[corpus_list], k)
    list_max_word = [dic.get(t[0]) for t in list_max_k]
    s = " ".join(list_word).replace("\n", "")
    cal_list = []
    # Split the article into clauses on Chinese punctuation and score each one:
    # score = (keyword hits)^2 / clause length, which rewards keyword density.
    for i, sen in enumerate(re.split(',|。|:|;|?|!', s)):
        sen_list = sen.split(' ')
        hits = sum(1 for word in sen_list if word in list_max_word)
        length = len(sen_list)  # str.split(' ') never returns an empty list
        temp_value = hits * hits / length
        sen = ''.join(sen.split())  # remove the word-segmentation spaces
        cal_list.append((i, temp_value, sen))
    # Rank clauses by score (descending), breaking ties by original position.
    cal_list = sorted(cal_list, key=lambda x: (-x[1], x[0]))
    all_size = 0
    ans_list = []
    # Greedily take the best clauses while the summary, counting one separator
    # per clause, stays within 60 characters; an earlier variant did not count
    # the separator: all_size + len(t[2]) <= 60.
    for t in cal_list:
        if all_size + len(t[2]) + 1 <= 60 and t[1] > 0:
            ans_list.append(t)
            all_size += len(t[2]) + 1
    # Restore original clause order and join with punctuation.
    ans_list = sorted(ans_list, key=lambda x: x[0])
    ans = ""
    for i, t in enumerate(ans_list):
        ans += t[2]
        ans += "。" if i == len(ans_list) - 1 else ","
    return ans
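
# Worked example of the clause score above (hypothetical numbers): a 5-word
# clause containing 2 keywords scores 2 * 2 / 5 = 0.8, while a 10-word clause
# with the same 2 keywords scores 2 * 2 / 10 = 0.4, so the score rewards
# keyword density rather than raw keyword count alone.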
def use_tfidf_cal_summary(test_filepath, result_filepath, k):
    """Summarize every article in test_filepath and write one summary per line."""
    # Pre-trained gensim dictionary and TF-IDF model; the commented pair points
    # at the full-corpus versions.
    # dic = corpora.Dictionary.load("./model/dictionary.tfidf.dic")
    # model_tfidf = models.TfidfModel.load("./model/tfidf_model")
    dic = corpora.Dictionary.load("./model/test_dictionary.tfidf.dic")
    model_tfidf = models.TfidfModel.load("./model/test_tfidf_model")
    list_test_article = get_data.get_cut_data_list_list(test_filepath)
    corpus = [dic.doc2bow(text) for text in list_test_article]
    with open(result_filepath, "w", encoding="utf8") as result_f:
        for i, tuple_list in enumerate(corpus):
            ans = get_index_of_summary(dic, model_tfidf, tuple_list, k, list_test_article[i])
            print(i, ans)
            result_f.write(ans + "\n")
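
# A minimal sketch of how the dictionary and TF-IDF model loaded above could
# have been built with gensim; the training-file path is an assumption, not a
# path confirmed by this script:
#
#   texts = get_data.get_cut_data_list_list("./data/train_cut.txt")  # assumed path
#   dic = corpora.Dictionary(texts)
#   tfidf = models.TfidfModel([dic.doc2bow(text) for text in texts])
#   dic.save("./model/test_dictionary.tfidf.dic")
#   tfidf.save("./model/test_tfidf_model")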
if __name__ == "__main__":
    # test_filepath = "./data/cut_article_test.txt"
    test_filepath = "./data/test_data/cut_test.txt"
    result_filepath = "./result/test_data_result_tfidf_k23.txt"
    use_tfidf_cal_summary(test_filepath, result_filepath, 23)
    # Earlier sweep over keyword ratios k = 5..30, kept for reference:
    # for k in range(5, 31):
    #     result_filepath = "./result/EK_tfidf_result/0504_k=%d.txt" % k
    #     use_tfidf_cal_summary(test_filepath, result_filepath, k)