-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbtsja_parser.py
246 lines (229 loc) · 8.93 KB
/
btsja_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import re
import csv
from bs4 import BeautifulSoup
def map_roman_nums(roman_num):
    '''
    Map a roman numeral (used in the original dictionary) to an integer.

    The lookup is case-insensitive and ignores surrounding whitespace,
    so values such as 'ii', ' II' or 'II\n' all map to 2.

    roman_num: string holding a roman numeral from I to X.

    Returns the integer value (1-10).
    Raises KeyError if the text is not one of the supported numerals.
    '''
    roman_num_map = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5,
                     'VI': 6, 'VII': 7, 'VIII': 8, 'IX': 9, 'X': 10}
    # Callers only remove tab characters before passing the text in, so
    # stray spaces or newlines are dropped here before the lookup.
    return roman_num_map[roman_num.strip().upper()]
def get_word(soup):
    '''
    Get word, variant and homonym number (if they exist).

    Reads the children of the first <b> tag of `soup`, lower-cases them,
    removes tab characters and interprets the resulting pieces:
      1 piece  -> the word itself
      2 pieces -> word + homonym number, word + "(variant)",
                  or a two-part word joined by a space
      3 pieces -> word + "(variant)" + homonym number
    Any other layout leaves the word unset.

    Returns a dict with keys 'word' (str or None), 'variant'
    (str or None) and 'homonym_num' (int, 0 when there are no homonyms).
    '''
    # Markup that is skipped when collecting the pieces of the headword
    skipped_markup = '<b><u><mark> </mark></u></b>'
    parts = [str(node).lower().replace('\t', '')
             for node in soup.b.contents
             if str(node) != skipped_markup]

    def has_roman_letters(text):
        # Numerals are recognised by a latin 'i' or 'v'.
        # NOTE(review): a lone 'x' (10) is not detected by this check
        # - confirm no entry has that many homonyms.
        return 'i' in text or 'v' in text

    def has_parentheses(text):
        return '(' in text and ')' in text

    headword, variant, homonym_num = None, None, 0

    if len(parts) == 1:
        headword = parts[0]
    elif len(parts) == 2:
        first, second = parts
        if has_roman_letters(second):
            headword = first
            homonym_num = map_roman_nums(second)
        elif has_parentheses(second):
            headword = first
            # Drop the first and last character (the parentheses)
            variant = second[1:-1]
        else:
            headword = ' '.join(parts)
    elif len(parts) == 3:
        first, second, third = parts
        if has_parentheses(second) and has_roman_letters(third):
            headword = first
            variant = second[1:-1]
            homonym_num = map_roman_nums(third)
    # TODO: add case "WORD (VARIANTS, ...)"" nb: variants often grammar words

    # Filter prefixes: entries ending with '-' or containing '-)'
    if headword and (headword[-1] == '-' or '-)' in headword):
        headword = None

    return {'word': headword,
            'variant': variant,
            'homonym_num': homonym_num}
def get_meta(soup):
    '''
    Get meta - reference to the volume and page of original book.

    Scans the children of the first <div> tag of `soup` for text that
    holds a "(...)" pair and returns what is between the parentheses.
    When several children qualify, the last one wins.

    Returns the reference text, or None when no child contains a
    parenthesised reference.
    '''
    meta = None
    for div_content in soup.div.contents:
        # Only text children are searched (bs4 NavigableString objects
        # are str instances); nested tags are skipped.
        if not isinstance(div_content, str):
            continue
        s_pos = div_content.find('(')
        if s_pos == -1:
            continue
        # Look for the closing parenthesis after the opening one, so a
        # stray ')' earlier in the text cannot produce an empty slice.
        e_pos = div_content.find(')', s_pos + 1)
        if e_pos == -1:
            continue
        meta = div_content[s_pos + 1:e_pos]
    return meta
def clip_by_upper(str, first_skip = False):
    '''
    Clip string by first (or second) upper (capital) letter.

    str:        text to clip.
    first_skip: when False, the text is cut right before the first
                upper-case character. When True, numeric characters
                also count as boundaries and the cut is made before
                the second boundary; if the text has only one
                boundary, the cut falls on that one.

    Returns the part before the cut with surrounding whitespace
    stripped; the whole (stripped) text when no boundary is found.
    '''
    if first_skip:
        boundaries = (pos for pos, ch in enumerate(str)
                      if ch.isupper() or ch.isnumeric())
        first_hit = next(boundaries, None)
        second_hit = next(boundaries, None)
        if second_hit is not None:
            cut = second_hit
        elif first_hit is not None:
            # NOTE(review): with a single boundary the text is cut at
            # that boundary (often position 0) - confirm this is wanted.
            cut = first_hit
        else:
            cut = len(str)
    else:
        cut = next((pos for pos, ch in enumerate(str) if ch.isupper()),
                   len(str))
    return str[:cut].strip()
def clean_definition(definition):
    '''
    Clean definition by replacing abbreviations, removing
    special symbols and stripping.

    Steps, in order:
      1. expand abbreviations and normalise punctuation (init_map),
      2. strip quotes, brackets, dots, commas, dashes and whitespace
         from both ends,
      3. tidy spacing before ',' and ')' and drop <i> tags (fin_map),
      4. lower-case the result.

    definition: raw definition text.

    Returns the cleaned, lower-cased text.
    '''
    # Copying
    definition_cleaned = definition
    # Initial mapping. Replacements run in insertion order, so the
    # expansion of '-л.' can feed the later '...-либо' entries.
    init_map = {'напр. ': 'например ',  # keep the space before the next word
                '-л.': '-либо',
                'соотв.': 'соответственно',
                'нек-р':'некотор',
                'к-рый': 'который',
                'к-рого':'которого',
                '-н:':'-нибудь:',
                'т. д.': 'так далее',
                'т. п.': 'тому подобное',
                'употр.': 'употребляется',
                'букв.': 'буквально',
                # "преим." abbreviates "преимущественно" (mainly)
                'преим.': 'преимущественно',
                'в знач.': 'в значении',
                'с отриц. ф.': 'с отрицательной формой',
                'в отриц. ф.': 'в отрицательной форме',
                'кому-чему-либо': 'кому-, чему-либо',
                'кем-чем-либо': 'кем-, чем-либо',
                'кого-чего-либо': 'кого-, что-либо' if False else 'кого-, чего-либо',
                'кого-что-либо': 'кого-, что-либо',
                '‘':'"',
                '’':'"',
                # NOTE(review): the key below shows up as a single blank;
                # confirm it is the intended special character and not
                # a plain space.
                ' ': ':',
                ' :': ':',
                ';': ','}
    for word, replacement in init_map.items():
        definition_cleaned = definition_cleaned.replace(word, replacement)
    # Strip. str.strip() treats its argument as a set of characters,
    # and each entry is applied once, in this order.
    strip_map = ['"', '“', '[', '. ', ', ', '-']
    for strip_substring in strip_map:
        definition_cleaned = definition_cleaned.strip(strip_substring)
    definition_cleaned = definition_cleaned.strip()
    # Final mapping
    fin_map = {' ,': ',',
               ' )': ')',
               '<i>': '',
               '</i>': ''}
    for word, replacement in fin_map.items():
        definition_cleaned = definition_cleaned.replace(word, replacement)
    definition_cleaned = definition_cleaned.lower()
    return definition_cleaned
def get_features(soup):
    '''
    Get word features (part of speech, definition).

    Walks the children of the first <p> tag of `soup`. Blank children
    and children containing a <br/> are ignored. The remaining pieces
    are joined to find the part-of-speech label, and the first piece
    that follows the Θ marker is taken as the raw definition.

    Returns a dict with keys:
      'part_of_speech' - 'noun', 'adj', 'adv', 'interj' or 'verb';
                         None when the label is not one of the known
                         abbreviations,
      'definition'     - cleaned definition text; None when it is
                         empty or contains Sakha letters.
    '''
    theta_marker = '<b> Θ </b>'
    pieces = []
    definition = ''
    after_marker = False
    for node in soup.p.contents:
        text = str(node).strip()
        # Skip empty pieces and anything that carries a line break
        if not text or '<br/>' in text:
            continue
        pieces.append(text)
        # First kept piece seen after the marker becomes the definition
        if after_marker and not definition:
            definition = text
        if text == theta_marker and not definition:
            after_marker = True

    # PART OF SPEECH
    # Everything before the first upper-case letter, without tags,
    # "digit + dot" numbering and the opening quote
    pos_label = clip_by_upper(''.join(pieces))
    pos_label = re.sub(re.compile(r'<.*?>'), '', pos_label)
    pos_label = re.sub(re.compile(r'[0-9][.]'), '', pos_label)
    pos_label = pos_label.replace('“','').strip()
    # Only labels that match one of these abbreviations exactly are
    # accepted; anything else maps to None
    known_labels = {'аат.': 'noun',
                    'даҕ.': 'adj', # adjective
                    'сыһ.': 'adv', # adverb
                    'саҥа алл.': 'interj', # interjection
                    'туохт.': 'verb'}
    features = {'part_of_speech': known_labels.get(pos_label)}

    # DEFINITION
    # Cut at the second upper-case/numeric character, then clean
    definition = clean_definition(clip_by_upper(definition, first_skip = True))
    # A definition containing Sakha letters is rejected
    sakha_letters = ('Ҥ', 'ҥ', 'Ҕ' ,'ҕ' ,'Ө', 'ө', 'Һ', 'һ', 'Ү', 'ү')
    if definition and not any(letter in definition
                              for letter in sakha_letters):
        features['definition'] = definition
    else:
        features['definition'] = None
    return features
def parse_div(soup):
    '''
    Parse one dictionary entry.

    Combines the results of get_word() and get_features() for `soup`.

    Returns a dict with keys 'word', 'variant', 'homonym_num',
    'part_of_speech' and 'definition' (in this order), or None when
    the word, the definition or the part of speech is missing.
    '''
    word = get_word(soup)
    features = get_features(soup)
    # An entry is kept only when all three key fields are present
    if not (word['word'] and features['definition']
            and features['part_of_speech']):
        return None
    return {'word': word['word'],
            'variant': word['variant'],
            'homonym_num': word['homonym_num'],
            'part_of_speech': features['part_of_speech'],
            'definition': features['definition']}
# Read html-file with only divs (one entry per line)
with open('btsja_onlydivs.html', 'r', encoding = 'utf-8') as file:
    lines = file.readlines()

# Parse every line and write the successfully parsed entries to csv.
# The id column is the index of the line in the input file.
with open('btsja_parsed_v01.csv','w',newline='',encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    writer.writerow(['id', 'word', 'variant', 'homonym_num',
                     'part_of_speech', 'ru_definition'])
    for i, line in enumerate(lines):
        soup = BeautifulSoup(line, 'html.parser')
        parsed = parse_div(soup)
        if parsed is None:
            # Entry could not be parsed completely - skip it
            continue
        to_write = [str(i)]
        # Missing values become empty cells, everything else is text
        to_write.extend('' if parsed_item is None else str(parsed_item)
                        for parsed_item in parsed.values())
        writer.writerow(to_write)