# -*- coding: UTF-8 -*-
# author: Oliver
import jieba


class preprocessing():
    __PAD__ = 0  # padding token
    __EOS__ = 1  # end-of-sentence token
    __GO__ = 2   # start-of-sentence token
    __UNK__ = 3  # unknown-word token
    vocab = ['__PAD__', '__EOS__', '__GO__', '__UNK__']

    def __init__(self):
        self.encoderFile = "./data/question.txt"  # questions
        self.decoderFile = "./data/answer.txt"    # answers
        self.savePath = './data/'                 # output directory
        jieba.load_userdict("./data/supplementvocab.txt")  # supplementary user dictionary for jieba's Chinese segmentation
    def wordToVocabulary(self, originFile, vocabFile, segementFile):
        """Segment the raw corpus with jieba and build its vocabulary file."""
        vocabulary = []
        sege = open(segementFile, "w", encoding='utf-8')
        with open(originFile, 'r', encoding='utf-8') as en:
            for sent in en.readlines():
                words = jieba.lcut(sent.strip())  # jieba segmentation, returns a list of words
                if "enc" in segementFile:
                    print(words)
                vocabulary.extend(words)   # collect every word for the vocabulary
                for word in words:         # write this line's segmented words
                    sege.write(word + " ")
                sege.write("\n")
        sege.close()
        # Deduplicate (keeping first-occurrence order) and save the vocabulary
        vocab_file = open(vocabFile, "w", encoding='utf-8')
        _vocabulary = list(set(vocabulary))
        _vocabulary.sort(key=vocabulary.index)
        _vocabulary = self.vocab + _vocabulary  # prepend the special tokens to form the final vocabulary
        if "enc" in segementFile:
            print('encode_vocab_length: ', len(_vocabulary))
        else:
            print('decode_vocab_length: ', len(_vocabulary))
        for word in _vocabulary:
            vocab_file.write(word + "\n")
        vocab_file.close()
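    # For reference, the two files written by wordToVocabulary look roughly like
    # this (illustrative content only):
    #   *.segement: one sentence per line, jieba tokens separated by single spaces
    #   *.vocab:    one word per line; the line number is the word's index, with
    #               __PAD__/__EOS__/__GO__/__UNK__ occupying indices 0-3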
    def toVec(self, segementFile, vocabFile, doneFile):
        """Convert a segmented corpus file into index sequences using its vocabulary."""
        word_dicts = {}
        vec = []
        with open(vocabFile, "r", encoding='utf-8') as dict_f:  # build the word-to-index mapping from the vocabulary
            for index, word in enumerate(dict_f.readlines()):
                word_dicts[word.strip()] = index
        f = open(doneFile, "w", encoding='utf-8')
        # Add two synthetic pairs so that a single or repeated unknown symbol is answered with "other"
        if "enc.vec" in doneFile:
            f.write("3 3 3 3\n")
            f.write("3\n")
        elif "dec.vec" in doneFile:
            f.write(str(word_dicts.get("other", 3)) + "\n")
            f.write(str(word_dicts.get("other", 3)) + "\n")
        with open(segementFile, "r", encoding='utf-8') as sege_f:
            for sent in sege_f.readlines():
                sents = [i.strip() for i in sent.split(" ")[:-1]]
                vec.extend(sents)
                for word in sents:
                    f.write(str(word_dicts.get(word, self.__UNK__)) + " ")  # word -> index, falling back to __UNK__
                f.write("\n")
        f.close()
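    # For reference, each line of the *.vec files written by toVec is the
    # space-separated index sequence of one sentence (e.g. "15 8 42"), preceded
    # by the two synthetic unknown-symbol lines written above; the numbers here
    # are illustrative only.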
    def main(self):
        # Build the vocabularies and segmented files
        self.wordToVocabulary(self.encoderFile, self.savePath + 'enc.vocab', self.savePath + 'enc.segement')
        self.wordToVocabulary(self.decoderFile, self.savePath + 'dec.vocab', self.savePath + 'dec.segement')
        # Convert the segmented files into index vectors
        self.toVec(self.savePath + "enc.segement",
                   self.savePath + "enc.vocab",
                   self.savePath + "enc.vec")
        self.toVec(self.savePath + "dec.segement",
                   self.savePath + "dec.vocab",
                   self.savePath + "dec.vec")


if __name__ == '__main__':
    pre = preprocessing()
    pre.main()