data_util.py
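"""Build sememe-based word-substitution candidates for a Chinese corpus.

Reads a POS-tagged corpus, queries OpenHowNet for each vocabulary word's
sememes, and records, for every sense of every word, the other vocabulary
words whose sememe set matches it exactly under the same part of speech.
The candidate table and the sense-id mapping are pickled to aux_files/.
"""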
import os
import pickle

import OpenHowNet


def tokenize_corpus(path='data/corpus.txt'):
    """Read a POS-tagged corpus where each token has the form word/POS.

    The first field of every line is skipped. Returns parallel lists of
    word sequences and POS sequences, one entry per line.
    """
    assert os.path.exists(path)
    sentence_list = []
    pos_list = []
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            items = line.split()[1:]  # skip the leading field on each line
            words = [x.split('/')[0] for x in items]
            poses = [x.split('/')[1] for x in items]
            sentence_list.append(words)
            pos_list.append(poses)
    return sentence_list, pos_list


def build_vocab():
    """Count word frequencies over the whole corpus."""
    sentence_list, _ = tokenize_corpus()
    vocab_dict = {}
    for sentence in sentence_list:
        for word in sentence:
            if word not in vocab_dict:
                vocab_dict[word] = 1
            else:
                vocab_dict[word] += 1
    print("vocabulary table size: %d" % len(vocab_dict))
    return vocab_dict


def gen_sem_dict(vocab_dict):
    """Look up every vocabulary word in HowNet.

    Returns:
        word_sem: {word: {pos: [sememe_set, ...]}}, one sememe set per sense.
        word_pos: {word: set of POS tags, restricted to check_pos_list}.
    """
    check_pos_list = ['noun', 'verb', 'adj', 'adv']
    word_pos = {}
    word_sem = {}
    hownet_dict = OpenHowNet.HowNetDict()
    for word in vocab_dict:
        word_sem[word] = {}
        tree = hownet_dict.get_sememes_by_word(word, structured=True, lang='zh', merge=False)
        sememes = hownet_dict.get_sememes_by_word(word, structured=False, lang='zh', merge=False)
        sem_list = [x['sememes'] for x in sememes]
        pos_list = [x['word']['ch_grammar'] for x in tree]
        assert len(pos_list) == len(sem_list), "%d, %d" % (len(sem_list), len(pos_list))
        valid_pos = []
        for pos, sem in zip(pos_list, sem_list):
            if pos in check_pos_list:
                valid_pos.append(pos)
                if pos not in word_sem[word]:
                    word_sem[word][pos] = [sem]
                elif sem not in word_sem[word][pos]:
                    word_sem[word][pos].append(sem)
        word_pos[word] = set(valid_pos)
    return word_sem, word_pos
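
# Illustrative shapes of the structures built above (the sememe labels in
# this sketch are made up, not actual HowNet output):
#   word_sem = {'打': {'verb': [{'打#hit'}, {'做#do'}]}}
#   word_pos = {'打': {'verb'}}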


def gen_id2sem(word_sem):
    """Assign an integer sense id to every sememe set of every (word, pos)."""
    id_to_sem = {}
    for word, pos2sems in word_sem.items():
        id_to_sem[word] = {}
        for pos, sem_list in pos2sems.items():
            id_to_sem[word][pos] = dict(enumerate(sem_list))
    return id_to_sem


def compare_list(list1, list2):
    """Return True if any sememe set in list1 also occurs in list2."""
    for item in list1:
        assert isinstance(item, set), 'wrong type!'
        if item in list2:
            return True
    return False


def add_word(word, vocab_list, word_candidate, id_to_sem, word_sem, word_pos):
    """Fill word_candidate[word] with substitutes for each (pos, sense id)."""
    word_candidate[word] = {}
    if len(word_pos[word]) == 0:
        return
    orig_word_pos = word_pos[word]
    for pos in orig_word_pos:
        word_candidate[word][pos] = {}
    for sub_word in vocab_list:
        common_pos = word_pos[sub_word] & orig_word_pos
        if len(common_pos) == 0:
            continue
        for pos in common_pos:
            sub_sem_list = word_sem[sub_word][pos]
            id2sem = id_to_sem[word][pos]
            for i in range(len(id2sem)):
                if i not in word_candidate[word][pos]:
                    word_candidate[word][pos][i] = []
                # a candidate matches sense i only via an identical sememe set
                if id2sem[i] in sub_sem_list:
                    word_candidate[word][pos][i].append(sub_word)
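
# The finished table nests as {word: {pos: {sense_id: [substitutes]}}}: a word
# qualifies as a substitute for sense i when one of its own sememe sets under
# the same POS equals sense i's sememe set exactly.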


if __name__ == '__main__':
    word_candidate = {}
    vocab_dict = build_vocab()
    word_sem, word_pos = gen_sem_dict(vocab_dict)
    id_dict = gen_id2sem(word_sem)
    vocab_list = list(vocab_dict.keys())
    num_words = len(vocab_list)
    for count, word in enumerate(vocab_list, 1):
        print(count, '/', num_words)
        add_word(word, vocab_list, word_candidate, id_dict, word_sem, word_pos)

    # Spot-check the candidates generated for one word.
    test_word = '把握'
    subsets = word_candidate[test_word]
    for pos, subwords in subsets.items():
        print("pos: ", pos)
        print("subwords: ", subwords)
        print()
    print(id_dict[test_word])

    os.makedirs("aux_files", exist_ok=True)  # the dumps below fail without it
    with open("aux_files/word_candidate.pkl", 'wb') as f:
        pickle.dump(word_candidate, f)
    with open("aux_files/senseid.pkl", 'wb') as f:
        pickle.dump(id_dict, f)