# nlp_util.py
import logging
import re
from copy import copy

import numpy as np

logger = logging.getLogger(__name__)


def clean_text_for_corpus(text, min_frequency_num=3, min_sent_length=3):
"""
生成训练word2vector的语料
"""
# 过滤无向量单词、停词,统计词频
tokens_count = dict()
corpus_list = []
words_num = 0
for sent in text:
corpus = []
for token in clean_str(sent).split():
corpus.append(token)
            tokens_count[token] = tokens_count.get(token, 0) + 1
words_num += 1
corpus_list.append(corpus)
# low_frequency_num = int(words_num * 1e-4)
    # Drop tokens that appear fewer than min_frequency_num times
low_freq_tokens = {k for k, v in tokens_count.items()
if v < min_frequency_num}
corpus_list = [[t for t in s if t not in low_freq_tokens]
for s in corpus_list]
    # Drop short sentences (currently disabled)
# corpus_list = [x for x in corpus_list if len(x) > min_sent_length]
result = [' '.join(x) for x in corpus_list]
return result
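

# A minimal usage sketch (the sentences below are illustrative):
# >>> clean_text_for_corpus(["I love NLP!"] * 3)
# ['i love nlp !', 'i love nlp !', 'i love nlp !']
# With the default min_frequency_num=3 every token here appears exactly three
# times and survives; tokens seen fewer than three times are dropped.
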
# def split_sentence_spacy(text: str):
# if 'sentencizer' not in SpacyNLP().pipe_names:
# SpacyNLP().add_pipe(SpacyNLP().create_pipe('sentencizer'))
#
# doc = SpacyNLP()(text.lower())
# result = [x.text for x in doc.sents]
# return result
# def cut_sentence(text, max_length):
# doc = SpacyNLP()(text)
# if len(doc) > 0:
# return doc[:max_length].text
# else:
# return ''


def sentence_to_token_id_list(sentence, word2id):
    """Map tokens of a space-separated sentence to ids, skipping OOV tokens."""
    return [word2id[t] for t in sentence.split() if t in word2id]
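
# A minimal sketch (the toy word2id vocabulary is made up):
# >>> sentence_to_token_id_list('i love nlp !', {'i': 0, 'love': 1, 'nlp': 2})
# [0, 1, 2]    # '!' is out of vocabulary and silently dropped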


def filter_unused_words(sentence, word2id):
    """Keep only the tokens that are present in the word2id vocabulary."""
    return ' '.join(t for t in sentence.split() if t in word2id)
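
# A minimal sketch with the same toy vocabulary:
# >>> filter_unused_words('i love nlp !', {'i': 0, 'love': 1, 'nlp': 2})
# 'i love nlp'
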
# def delete_uncommon_words_punct(sentence, del_stop_word=True):
#
# words = []
# for token in SpacyNLP()(sentence):
# if not token.is_alpha:
# continue
# if token.norm in SpacyNLP().vocab.vectors.key2row.keys():
# if del_stop_word:
# if token.is_stop:
# continue
#
# words.append(token.text)
#
# result = ' '.join(words)
# return result


def get_token_count(sentences):
    """Count token occurrences across an iterable of space-separated sentences."""
    token_count = dict()
    for sent in sentences:
        for word in sent.split():
            token_count[word] = token_count.get(word, 0) + 1
    return token_count
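
# A minimal sketch:
# >>> get_token_count(['a b a', 'b c'])
# {'a': 2, 'b': 2, 'c': 1}
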
# def get_spacy_word_embedding() -> np.ndarray:
# return SpacyNLP().vocab.vectors.data


def fuse_two_word_embedding(word2id_a, embed_a, word2id_b, embed_b):
    """Merge vocabulary B into A; words shared by both keep their vectors from A."""
    word2id = copy(word2id_a)
    new_rows = []
    for k, v in word2id_b.items():
        if k not in word2id:
            word2id[k] = len(word2id)
            new_rows.append(embed_b[v])
    # Stack once at the end rather than per word, which avoids quadratic copying
    embed = np.vstack([embed_a] + new_rows) if new_rows else np.copy(embed_a)
    return word2id, embed
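
# A minimal sketch with toy 2-dimensional embeddings:
# >>> w2i_a, emb_a = {'a': 0}, np.array([[1.0, 1.0]])
# >>> w2i_b, emb_b = {'a': 0, 'b': 1}, np.array([[9.0, 9.0], [2.0, 2.0]])
# >>> w2i, emb = fuse_two_word_embedding(w2i_a, emb_a, w2i_b, emb_b)
# >>> w2i          # 'a' keeps its vector from A; only 'b' is appended
# {'a': 0, 'b': 1}
# >>> emb
# array([[1., 1.],
#        [2., 2.]])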


def clean_str(string):
    """Normalise a sentence: space out punctuation, split contractions,
    spell out standalone digits, and lower-case the result."""
    # Keep only letters, digits, and a small set of punctuation
    string = re.sub(r"[^A-Za-z0-9',.!;?()]", " ", string)
    # Surround the remaining punctuation with spaces
    string = re.sub(r"\.", " . ", string)
    string = re.sub(r"!+", " ! ", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r";", " ; ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    # Collapse long runs of dots and spaces into a single ellipsis
    string = re.sub(r"(\.|\s){7,}", " ... ", string)
    # Re-join spaced single-letter abbreviations, e.g. "u . s ." -> "u.s."
    string = re.sub(r"(?<= )(\w \. )+(\w \.)",
                    lambda x: x.group().replace(" ", ""), string)
    # string = re.sub(r"(\.|\s){4,}", " ... ", string)
    # Split contraction suffixes into separate tokens
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'m", " \'m", string)
    string = re.sub(r"\'ll", " \'ll", string)
    # string = re.sub(r"[^A-Za-z0-9']", " ", string)
    # Drop apostrophes that are not part of a recognised contraction
    string = re.sub(r"(?!(('(?=s\b))|('(?=ve\b))|('(?=re\b))|('(?=d\b))|('(?=ll\b))|('(?=m\b))|((?<=n\b)'(?=t\b))))'", " ", string)
    # Glove style
    # string = re.sub(' [0-9]{5,} ', ' ##### ', string)
    # string = re.sub(' [0-9]{4} ', ' #### ', string)
    # string = re.sub(' [0-9]{3} ', ' ### ', string)
    # string = re.sub(' [0-9]{2} ', ' ## ', string)
    # Spell out standalone digits. Per-digit substitutions such as
    # re.sub(' 1 ', ' one ', ...) skip every other digit in runs like
    # " 1 1 1 " because each match consumes the shared space; zero-width
    # lookarounds avoid that and also cover digits at the string edges.
    digit_words = {'0': 'zero', '1': 'one', '2': 'two', '3': 'three',
                   '4': 'four', '5': 'five', '6': 'six', '7': 'seven',
                   '8': 'eight', '9': 'nine'}
    string = re.sub(r"(?<!\S)[0-9](?!\S)",
                    lambda m: digit_words[m.group()], string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
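

if __name__ == '__main__':
    # Minimal smoke test of the full normalisation (the input is illustrative)
    print(clean_str("I can't wait, it's 5 stars!"))
    # -> i ca n't wait , it 's five stars !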