-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathtry_udpipe.py
183 lines (138 loc) · 5.69 KB
/
try_udpipe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""
Listing for practice with spacy-udpipe module
Warning!
The following functions are NOT to be imported in your work.
Instead, use them as a reference for the library API.
0. Installation
spacy-udpipe is not a standard Python library: it is not pre-installed.
Make sure to specify library name and version in the requirements.txt file!
Make sure to install the library in your working environment!
"""
from pathlib import Path
try:
import spacy
import spacy_udpipe
except ImportError:
print('No libraries installed. Failed to import.')
from core_utils.constants import UDPIPE_MODEL_PATH
from core_utils.pipeline import AbstractCoNLLUAnalyzer
def load_model(model_path: Path) -> spacy.Language | AbstractCoNLLUAnalyzer:
    """
    1. Loading model

    This is a necessary step for your work with lab 6.

    The pre-downloaded UDPipe model is read through the spacy-udpipe interface.

    NOTE:
        `spacy_udpipe.load_from_path` is called here with 2 keyword arguments:
            1. `lang` - language specification (`ru` for the Russian language)
            2. `path` - location of the model file, given as a string

    Args:
        model_path (Path): Path to pre-downloaded UDPipe model

    Returns:
        spacy.Language | AbstractCoNLLUAnalyzer: Loaded language model
    """
    # The loader is given a plain string, so the Path object is converted first.
    path_as_text = str(model_path)
    return spacy_udpipe.load_from_path(lang="ru", path=path_as_text)
def explore_model(model: spacy.Language) -> dict | None:
    """
    2. Exploring model

    This is NOT a necessary step for your work with lab 6.

    A Spacy model is built from several pipelines (tagging, lemmatization etc.).
    Inspecting them shows which processing steps the model currently contains.

    Args:
        model (spacy.Language): Language model

    Returns:
        dict | None: Whatever `model.analyze_pipes()` reports about the pipelines
    """
    pipes_overview = model.analyze_pipes()
    return pipes_overview
def enable_conllu_formatting(model: spacy.Language) -> spacy.Language:
    """
    3. Adding CoNLL-U formatter

    This is a necessary step for your work with lab 6.

    CoNLL-U formatted annotations are produced by an extra pipeline that
    is attached to the Spacy model with its `add_pipe` method. The call takes:
        1. pipe name (here, `conll_formatter`)
        2. configuration

    The configuration used here contains:
        - `conversion_maps`: XPOS tags are not specified for this model, so the
          empty string "" is mapped to the missing value symbol "_"
        - `include_headers`: set to `True`, because annotated files are required
          to contain headers with sentence number and text

    Args:
        model (spacy.Language): Language model

    Returns:
        spacy.Language: The same model object, now with the formatter pipe added
    """
    formatter_config = {
        "conversion_maps": {"XPOS": {"": "_"}},
        "include_headers": True,
    }
    # `last=True` places the formatter after every pipe already in the model.
    model.add_pipe("conll_formatter", last=True, config=formatter_config)
    return model
def annotate_text(model: spacy.Language, text: str) -> str:
    """
    4. Annotating text with CoNLL-U format

    This is a necessary step for your work with lab 6.

    The annotation is obtained in two moves:
        1. the model is called on the text
        2. the formatted string is read from the `_.conll_str` attribute
           of the result

    Args:
        model (spacy.Language): Language model
        text (str): text to analyze via UDPipe model

    Returns:
        str: CoNLL-U annotation
    """
    # The attribute value is wrapped in str() so the declared return type holds.
    return str(model(text)._.conll_str)
def export_conllu_annotation(annotation: str, path: Path) -> None:
    """
    5. Save extracted features to CoNLL-U file

    This is a necessary step for your work with lab 6.

    The annotation string is written by hand to a .conllu file: the file
    at the given path is opened for writing in UTF-8, the annotation is
    written to it, and one additional newline is written after it.

    Args:
        annotation (str): CoNLL-U annotation of the text
        path (Path): Path to the resulting file with CoNLL-U annotation
    """
    with open(path, 'w', encoding='utf-8') as conllu_file:
        # The annotation goes first, then the required extra newline.
        conllu_file.writelines((annotation, "\n"))
def main() -> None:
    """
    Run the seminar's listing step by step: load the model, inspect it,
    add the CoNLL-U formatter, annotate a sample text and save the result
    """
    # 1. Read the UDPipe model
    # It is pre-downloaded for you from https://universaldependencies.org/
    nlp = load_model(UDPIPE_MODEL_PATH)
    assert isinstance(nlp, spacy.Language)

    # 2. Explore the loaded UDPipe model and explain
    # whether it is ready to perform CoNLL-U annotation
    pipes_before = explore_model(nlp)
    print(pipes_before)

    # 3. Add CoNLL-U formatting pipeline to the model
    enable_conllu_formatting(nlp)
    pipes_after = explore_model(nlp)
    print(pipes_after)
    assert isinstance(pipes_after, dict)
    assert 'conll_formatter' in pipes_after['summary']

    # 4. Annotate text using CoNLL-U format
    sample_text = "Привет! Я люблю программировать."
    conllu_text = annotate_text(nlp, sample_text)
    print(conllu_text)
    assert 'Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Ac' in conllu_text

    # 5. Write extracted CoNLL-U annotation to the file
    output_path = Path('analyzed_text.conllu')
    export_conllu_annotation(conllu_text, output_path)


# Execute the listing only when this file is run as a script.
if __name__ == "__main__":
    main()