-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocess_wiki_files.py
133 lines (93 loc) · 3.88 KB
/
process_wiki_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from somajo import Tokenizer, SentenceSplitter
import os
from multiprocessing import Pool, cpu_count
# Root directory containing the extracted wiki files (one subdirectory per shard).
INPUT_DIR = "data"
# Directory the cleaned sentence files are appended to; must already exist,
# since open(..., 'a') does not create directories.
OUTPUT_DIR = "output"
# True when processing Wikipedia discussion (talk) pages, which need extra
# cleanup: trailing signature dashes and "( CET )"/"( CEST )" timestamps.
PROCESS_DISCUSSION = False
def is_doc_start_line(line):
    """Return True if *line* opens a WikiExtractor ``<doc ...>`` tag."""
    return line[:4] == '<doc'
def is_doc_end_line(line):
    """Return True if *line* is a WikiExtractor ``</doc>`` closing tag."""
    return line[:5] == '</doc'
def remove_discussion_suffix(sentence):
    """Truncate a token list at the last signature-dash marker.

    Discussion-page signatures are introduced by "--", "--[" or "---";
    everything from the *last* such token onward is dropped. The list is
    returned unchanged when no marker is present.
    """
    markers = ("--", "--[", "---")
    # Scan from the right: the first hit is the last marker position.
    for idx in range(len(sentence) - 1, -1, -1):
        if sentence[idx] in markers:
            return sentence[:idx]
    return sentence
def get_data_dirs(root_dir):
    """Return the names of the immediate subdirectories of *root_dir*.

    The previous implementation walked the whole tree and collected
    directory names from *every* level, so for nested layouts the caller's
    ``os.path.join(INPUT_DIR, name)`` produced nonexistent paths. Only the
    direct children of ``root_dir`` are meaningful here.

    Returns an empty list when *root_dir* does not exist or is not a
    directory.
    """
    for _, dir_names, _ in os.walk(root_dir):
        # First iteration of os.walk yields the direct children of root_dir.
        return list(dir_names)
    return []
# Lazily-created singletons: constructing a SoMaJo Tokenizer/SentenceSplitter
# is expensive, and the original code rebuilt both for every single input
# line. Each worker process creates its own pair exactly once.
_TOKENIZER = None
_SENTENCE_SPLITTER = None
def process_text_line(line):
    """Tokenize *line*, split it into sentences, and return cleaned sentences.

    Returns a list of space-joined sentence strings. Sentences shorter than
    4 tokens are dropped. When PROCESS_DISCUSSION is True, trailing signature
    dashes are stripped first and sentences that still contain a timestamp
    marker ("( CEST )" / "( CET )") are discarded.
    """
    global _TOKENIZER, _SENTENCE_SPLITTER
    if _TOKENIZER is None:
        _TOKENIZER = Tokenizer()
        _SENTENCE_SPLITTER = SentenceSplitter()
    tokens = _TOKENIZER.tokenize(line)
    sentences = _SENTENCE_SPLITTER.split(tokens)
    result = []
    for s in sentences:
        if PROCESS_DISCUSSION:
            s = remove_discussion_suffix(s)
        if len(s) < 4:
            continue  # too short to be a useful sentence
        sentence_string = " ".join(s)
        # check if this line still contains a dirty comment (timestamp):
        if PROCESS_DISCUSSION and (
            "( CEST )" in sentence_string or "( CET )" in sentence_string
        ):
            continue
        result.append(sentence_string)
    return result
def process_directory(input_dir, output_file):
    """Clean every extracted wiki file under *input_dir* and append the
    resulting sentences, one per line, to ``OUTPUT_DIR/output_file``.

    ``<doc ...>`` / ``</doc>`` tag lines are dropped, as is the headline that
    immediately follows each ``<doc ...>`` tag. Empty lines are skipped. No
    trailing newline is written at the end of the output file.

    Fixes over the original: the *output_file* parameter is no longer
    shadowed by the open file handle, and both files are opened with an
    explicit UTF-8 encoding (WikiExtractor output is UTF-8; the platform
    default could otherwise corrupt or reject the data).
    """
    out_path = os.path.join(OUTPUT_DIR, output_file)
    with open(out_path, 'a', encoding='utf-8') as out_handle:
        # Tracks whether anything was written yet, so newlines go *between*
        # sentences and the file never ends with a trailing newline.
        first_line_written = False
        for root, _, file_names in os.walk(input_dir):
            for file_name in file_names:
                next_input_file = os.path.join(root, file_name)
                print("Reading file:", next_input_file)
                with open(next_input_file, "r", encoding='utf-8') as input_file:
                    skip_next_line = False
                    for line in input_file:
                        if is_doc_start_line(line):
                            # drop the tag line and remember to drop the
                            # headline that follows it
                            skip_next_line = True
                            continue
                        if is_doc_end_line(line):
                            continue
                        if skip_next_line:
                            skip_next_line = False
                            continue
                        if len(line) <= 1:
                            continue  # empty line ("\n" only)
                        for sentence in process_text_line(line):
                            # Length filter: > 2 drops blank-ish leftovers such
                            # as "\n"; discussion mode uses the stricter > 72
                            # to discard short signature/timestamp noise.
                            min_len = 72 if PROCESS_DISCUSSION else 2
                            if len(sentence) > min_len:
                                if first_line_written:
                                    out_handle.write("\n")
                                else:
                                    first_line_written = True
                                out_handle.write(sentence)
def pd(map_item):
    """Adapter for Pool.map: unpack an (input_dir, output_file) pair and
    forward it to process_directory."""
    source_dir, target_name = map_item
    print("Creating:", target_name)
    process_directory(source_dir, target_name)
if __name__ == '__main__':
    # One [input directory, output file] pair per extracted wiki shard.
    call_list = [
        [os.path.join(INPUT_DIR, shard), shard + ".txt"]
        for shard in get_data_dirs(INPUT_DIR)
    ]
    pool_size = cpu_count() * 2
    print("pool_size:", pool_size)
    with Pool(pool_size) as worker_pool:
        worker_pool.map(pd, call_list)
    print("Done!")