preprocess_wiki.py
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import pickle
import sys
import bz2
import os
import os.path
import re

import nltk
import gensim.corpora

# Norwegian Punkt sentence tokenizer. Loading by resource path lets nltk
# search its data directories (requires the 'punkt' data to be installed)
# instead of relying on a hardcoded home directory.
sent_detector = nltk.data.load('tokenizers/punkt/norwegian.pickle')

# Collapse runs of two or more newlines down to exactly two (one blank line).
nl_re = re.compile('([\n]){2,}', flags=re.MULTILINE | re.UNICODE)
# Collapse runs of straight quotes (leftover wiki bold/italic markup) to one.
fnutt_replace = re.compile(r"([']){2,}", flags=re.MULTILINE | re.UNICODE)
# Punctuation and special characters to pad with spaces before tokenization.
re_space_insert = re.compile(r'([\?\.\,\:\;\!\'="_$%&@#\(\)\[\]\{\}\*|«»])',
                             flags=re.MULTILINE | re.UNICODE)
# Runs of spaces to collapse into a single space.
re_multispace = re.compile('[ ]+', flags=re.MULTILINE | re.UNICODE)
def preproc_for_tok(txt):
    # Pad punctuation and special characters with spaces, then collapse
    # repeated spaces, so a plain whitespace split yields clean tokens.
    txt = re_space_insert.sub(r' \1 ', txt)
    txt = re_multispace.sub(' ', txt)
    return txt
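
# The main block is a staged pipeline driven by which pickle files already
# exist in ./data/wikipages: each run performs the first stage whose output
# file is missing, so the script is meant to be run repeatedly (four times
# in total) on the same dump until all stages are done.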
if __name__ == '__main__':
"""
Usage:
python3 preprocess_wiki.py /path/to/bz2/wikimedia/dump
Output:
xml pages of wiki (in data/wikipages)
"""
print(sys.argv[1])
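
    # Stage 1: stream (title, text, pageid) triples out of the compressed
    # MediaWiki dump with gensim and cache them as a pickle.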
    if not os.path.isfile('./data/wikipages/wiki_pages.pickle'):
        print('joy1 - extracting pages')
        texts = [(text, title, pageid) for title, text, pageid
                 in gensim.corpora.wikicorpus.extract_pages(bz2.BZ2File(sys.argv[1]))]
        with open('./data/wikipages/wiki_pages.pickle', mode='wb') as ff:
            pickle.dump(texts, ff, protocol=4)
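    # Stage 2: strip the wiki markup from every cached page with gensim's
    # filter_wiki and cache the plain-text pages.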
    elif not os.path.isfile('./data/wikipages/wiki_pages_raw.pickle'):
        print('joy2 - stripping wiki markup')
        with open('./data/wikipages/wiki_pages.pickle', mode='rb') as ff:
            texts = pickle.load(ff)
        texts2 = []
        for text, title, pageid in texts:
            text = gensim.corpora.wikicorpus.filter_wiki(text)
            texts2.append((text, title, pageid))
        with open('./data/wikipages/wiki_pages_raw.pickle', mode='wb') as ff:
            pickle.dump(texts2, ff, protocol=4)
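    # Stage 3: drop redirect pages and normalise the plain text (collapse
    # blank-line runs and runs of quote characters).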
    elif not os.path.isfile('./data/wikipages/wiki_pages_raw_processed.pickle'):
        print('joy3 - normalising text')
        with open('./data/wikipages/wiki_pages_raw.pickle', mode='rb') as ff:
            texts = pickle.load(ff)
        texts2 = []
        for text, title, pageid in texts:
            if text.startswith('#REDIRECT'):
                continue
            text = nl_re.sub(r'\1\1', text)
            text = fnutt_replace.sub(r'\1', text)
            texts2.append((text, title, pageid))
        with open('./data/wikipages/wiki_pages_raw_processed.pickle', mode='wb') as ff:
            pickle.dump(texts2, ff, protocol=4)
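    # Stage 4: split the normalised pages into sentences, keep only
    # prose-like sentences, pad their punctuation with spaces, and pickle
    # the resulting sentence list.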
    else:
        print('joy4 - extracting sentences')
        with open('./data/wikipages/wiki_pages_raw_processed.pickle', mode='rb') as ff:
            texts = pickle.load(ff)
        acc_sents = []
        iii = 0
        no_starts = ['==', '*', '(', 'Kategori:', 'ISBN']

        def txt_startswith(txt, nostarts):
            # Reject sentences that are not regular prose: headings, list
            # items, parentheticals, category links ('Kategori:' is the
            # Norwegian 'Category:'), ISBN references, sentences starting
            # with a lowercase or caseless character, and anything that
            # still spans multiple lines.
            if not txt:
                return True
            for tt in nostarts:
                if txt.startswith(tt):
                    return True
            if txt.startswith(txt[0].lower()):
                # True when the first character is lowercase or has no
                # uppercase form (digits, punctuation).
                return True
            if ('Kategori:' in txt) or ('==' in txt) or ('\n' in txt):
                return True
            return False

        for text, title, pageid in texts:
            sents = sent_detector.tokenize(text)
            for sent in sents:
                if not txt_startswith(sent, no_starts):
                    acc_sent = preproc_for_tok(sent)
                    acc_sents.append(acc_sent)
                    iii += 1
                    if iii % 5000 == 0:
                        # Progress indicator: print every 5000th kept sentence.
                        print(acc_sent)
                        print('')
        with open('./data/wikipages/sentlist.pickle', mode='wb') as ff:
            pickle.dump(acc_sents, ff)