-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
76 lines (54 loc) · 1.96 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gzip
import requests
import re
from os import path
from xml.etree.ElementTree import Element, tostring
from tempfile import TemporaryDirectory
from python.odict import Dictionary
url = "https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz"

# One dictionary line, as consumed below:
#   <traditional> <simplified> [<pronunciation>] /<definition>/<definition>/.../
# Compiled once here instead of on every loop iteration; raw string so that
# "\s" and "\[" reach the regex engine without invalid-escape warnings.
_CEDICT_LINE = re.compile(r"(.*?)\s(.*?)\s\[(.*?)]\s/(.*)/")


def _parse_line(line):
    """Split one line of the CC-CEDICT text file into its fields.

    Returns a ``(traditional, simplified, pronunciation, definitions)``
    tuple, where ``definitions`` is the list obtained by splitting the
    definition section on ``/``. Returns ``None`` when the line does not
    match the entry pattern.
    """
    m = _CEDICT_LINE.match(line)
    if m is None:
        return None
    traditional, simplified, pronunciation, raw_definitions = m.groups()
    return traditional, simplified, pronunciation, raw_definitions.split('/')


def _add_etymology(entries, simplified, pronunciation, definitions):
    """Append one <ety>/<usage> group of definitions to the entry for a term.

    The <entry> element is created on first sight of ``simplified`` and
    stored in ``entries``; later lines with the same simplified form reuse
    it, so the ``pronunciation`` attribute is the one from the first line.
    Returns the entry element.
    """
    entry = entries.get(simplified)
    if entry is None:
        # Build the element only when it is actually needed.
        entry = Element(
            "entry",
            attrib={
                'term': simplified,
                'pronunciation': pronunciation,
            },
        )
        entries[simplified] = entry

    ety = Element("ety")
    usage = Element("usage")
    for text in definitions:
        d = Element("definition")
        d.text = text
        usage.append(d)
    ety.append(usage)
    entry.append(ety)
    return entry


with TemporaryDirectory() as dirpath:
    try:
        file_name = url.split('/')[-1]

        # Fail early on HTTP errors instead of writing an error page to disk
        # and tripping over it later as a gzip error; the timeout keeps a
        # stalled connection from hanging the script forever.
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        output_path = path.join(dirpath, file_name)
        with open(output_path, 'wb') as new_file:
            new_file.write(response.content)

        root = Element('dictionary')
        root.attrib["name"] = "CC-CEDICT"

        # Simplified term -> its <entry> element, in first-seen order.
        entries = {}

        # The context manager guarantees the archive is closed before the
        # temporary directory is removed, even if parsing raises.
        # newline='\n' splits lines exactly like the binary readline() did.
        with gzip.open(output_path, 'rt', encoding='utf-8', newline='\n') as g:
            for line in g:
                parsed = _parse_line(line)
                if parsed is None:
                    # Lines that do not match the entry pattern are skipped.
                    continue
                _, simplified, pronunciation, definitions = parsed
                print("Processing word %s..." % simplified)
                _add_etymology(entries, simplified, pronunciation, definitions)

        print("Writing to \"cedict.odict\"...")
        root.extend(entries.values())
        xml = tostring(root).decode('utf-8')
        Dictionary.write(xml, "cedict.odict")
    except Exception as e:
        # Top-level boundary of the script: report the failure and let the
        # temporary directory be cleaned up (original best-effort behavior).
        print(e)