tokenization.py
import re
import multiprocessing
from collections import defaultdict
from typing import List

from tqdm import tqdm
from pyhanlp import *  # provides the HanLP segmenter used below
import spacy

# Map a language code to the class name of the tokenizer that handles it;
# any language not listed falls back to the SpaCy tokenizer.
LANG_CLS = defaultdict(lambda: "SpacyTokenizer")
LANG_CLS.update({
    "zh": "HanLPTokenizer",
    "en": "SpacyTokenizer",
})

# SpaCy model to load for each supported language code.
SPACY_MODEL = {
    "en": "en_core_web_sm",
    "ja": "ja_core_news_sm",
}


class HanLPTokenizer(object):
    """Tokenizer for Chinese text, backed by the HanLP segmenter."""

    def __init__(self, stopwords=None):
        # Digits plus ASCII and common CJK/full-width punctuation, stripped
        # from every token after segmentation.
        self.pat = re.compile(r'[0-9!"#$%&\'()*+,-./:;<=>?@—,。:★、¥…【】()《》?“”‘’!\[\\\]^_`{|}~\u3000]+')
        self.stopwords = stopwords
        print("Using HanLP tokenizer")

    def tokenize(self, lines: List[str]) -> List[List[str]]:
        docs = []
        for line in tqdm(lines):
            tokens = [t.word for t in HanLP.segment(line)]
            tokens = [self.pat.sub('', t).strip() for t in tokens]
            tokens = [t for t in tokens if t != '']
            if self.stopwords is not None:
                tokens = [t for t in tokens if t not in self.stopwords]
            docs.append(tokens)
        return docs


class SpacyTokenizer(object):
    """Tokenizer backed by a SpaCy pipeline: lemmatizes and drops stopwords and punctuation."""

    def __init__(self, lang="en", stopwords=None):
        self.stopwords = stopwords
        # NER and the dependency parser are not needed here; disable them for speed.
        self.nlp = spacy.load(SPACY_MODEL[lang], disable=['ner', 'parser'])
        print("Using SpaCy tokenizer")

    def tokenize(self, lines: List[str]) -> List[List[str]]:
        docs = self.nlp.pipe(lines, batch_size=1000, n_process=multiprocessing.cpu_count())
        docs = [[token.lemma_ for token in doc if not (token.is_stop or token.is_punct)] for doc in docs]
        if self.stopwords is not None:
            # Apply the caller-supplied stopword list on top of SpaCy's built-in
            # one; previously this argument was accepted but never used.
            docs = [[t for t in doc if t not in self.stopwords] for doc in docs]
        return docs
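

# LANG_CLS maps language codes to tokenizer class names but is never
# resolved in this file. A minimal factory along these lines could do the
# lookup; build_tokenizer is a hypothetical helper sketched here for
# illustration, not part of the original module.
def build_tokenizer(lang: str, stopwords=None):
    cls = globals()[LANG_CLS[lang]]
    if cls is SpacyTokenizer:
        # SpacyTokenizer also needs the language code to pick a SpaCy model.
        return cls(lang=lang, stopwords=stopwords)
    return cls(stopwords=stopwords)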


if __name__ == '__main__':
    tokenizer = HanLPTokenizer()
    print(tokenizer.tokenize(['他拿的是《红楼梦》?!我还以为他是个Foreigner———']))
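    # A SpaCy counterpart of the demo above; assumes the en_core_web_sm model
    # has been installed (python -m spacy download en_core_web_sm).
    en_tokenizer = SpacyTokenizer(lang="en")
    print(en_tokenizer.tokenize(["The striped bats were hanging on their feet."]))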