"""
Listing for practice with stanza module
0. Installation
Warning!
The following functions are NOT to be imported in your work.
Instead, use it as a reference for library API.
0. Installation
stanza is not a standard Python library: it is not pre-installed.
Make sure to specify library name and version in the requirements.txt file!
Make sure to install the library in your working environment!
"""
from pathlib import Path

from core_utils.pipeline import CoNLLUDocument, StanzaDocument

try:
    import stanza
    from stanza.models.common.doc import Document
    from stanza.pipeline.core import Pipeline
    from stanza.utils.conll import CoNLL
except ImportError:
    print('Failed to import stanza. Make sure the library is installed.')


def load_model() -> Pipeline:
    """
    1. Loading the model

    This is a necessary step for your work with lab 6.

    Let's use the stanza interface to do the following:
    1. Download a stanza model
    2. Initialize a pipeline with the downloaded model

    Notice that, similar to spacy-udpipe, stanza models consist of
    different processors, such as tokenization, PoS tagging
    and dependency parsing. For this reason, we specify
    the required analysis stages in the arguments.

    Returns:
        stanza.pipeline.core.Pipeline: Language model
    """
language = "ru"
processors = "tokenize,pos,lemma,depparse"
stanza.download(lang=language, processors=processors, logging_level="INFO")
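    # download_method=None tells the Pipeline constructor not to attempt
    # any (re-)download: the model has already been fetched above.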
model = Pipeline(
lang=language,
processors=processors,
logging_level="INFO",
download_method=None
)
return model


def analyze_text(model: Pipeline, text: str) -> Document | list[StanzaDocument]:
    """
    2. Extracting the features from the text

    This is a necessary step for your work with lab 6.

    Stanza's interface for analyzing the text is a little different:
    1. The `process` method accepts a Document instance (or a list of
       such instances) containing the text to be processed
    2. A Document instance is initialized with the following arguments:
        - a list of sentences in the CoNLL-U format,
          if available (in our case, leave it empty)
        - the original text

    The returned instance already contains information about lemmas,
    morphological features, PoS etc., but not in the CoNLL-U format.

    Args:
        model (stanza.pipeline.core.Pipeline): Language model
        text (str): Text to analyze via the stanza model

    Returns:
        stanza.models.common.doc.Document: Annotated text
    """
analyzed = model.process(Document([], text=text))
return analyzed
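
# A minimal sketch (not used in this listing) of the batch processing the
# docstring above mentions, assuming `model` comes from load_model() and
# `texts` is a list of strings: `process` also accepts a list of Document
# instances and returns a list of analyzed Documents.
#     docs = model.process([Document([], text=text) for text in texts])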


def export_conllu_annotation(document: Document, path: Path) -> None:
    """
    3. Save extracted features to a CoNLL-U file

    This is a necessary step for your work with lab 6.

    Stanza allows for a direct dump of the extracted information
    to a .conllu file. For this, we use the `write_doc2conll` method
    from the stanza.utils.conll.CoNLL module.
    It accepts a Document instance with the analyzed text and
    a path where the file should be saved.

    Args:
        document (stanza.models.common.doc.Document): Analyzed text
        path (pathlib.Path): Path to the resulting file with CoNLL-U annotation
    """
CoNLL.write_doc2conll(
doc=document,
filename=path,
)


def import_conllu_annotation(path: Path) -> Document | CoNLLUDocument:
    """
    4. Import a Document with the text description from a CoNLL-U file

    This is a necessary step for your work with lab 6.

    Conversely, stanza supports creating a Document from .conllu files.
    For this, we use the `conll2doc` method from
    the stanza.utils.conll.CoNLL module.
    It accepts a path to a CoNLL-U file.

    Args:
        path (pathlib.Path): Path to the file with CoNLL-U annotation

    Returns:
        stanza.models.common.doc.Document: Analyzed text
    """
return CoNLL.conll2doc(input_file=path)


def extract_linguistic_feature(document: Document, feature: str) -> list[list[int | str]]:
    """
    5. Extract annotation for each word individually

    This is a necessary step for your work with lab 6.

    Stanza allows access to each CoNLL-U markup feature of each word
    individually. This can help you perform qualitative analysis over a text
    (e.g. calculating statistics of a particular feature's occurrence
    or exploring the grammatical structure of a sentence).

    The structure of the analyzed text is organized as follows:
        stanza.Document: [
            stanza.Sentence: [stanza.Word, stanza.Word, ...]
            stanza.Sentence: [stanza.Word, stanza.Word, ...]
            ...]

    So, in order to access information about each particular word,
    one first has to iterate over the `Document.sentences` attribute.
    It contains a list of `stanza.Sentence` objects.
    Each `stanza.Sentence` object, in its turn, has a `Sentence.words`
    attribute, which contains a list of `stanza.Word` objects.
    Finally, `stanza.Word` contains all features used for
    CoNLL-U annotation: id, lemma, upos, head, deprel, misc etc.
    To conveniently access each feature, convert `stanza.Word`
    to a Python dictionary with the `.to_dict()` method.

    Sample resulting dictionary:
        {
            "id": 2,
            "text": "люблю",
            "lemma": "любить",
            "upos": "VERB",
            "feats": "Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act",
            "head": 0,
            "deprel": "root",
            "misc": "",
            "start_char": 10,
            "end_char": 15
        }

    Args:
        document (stanza.models.common.doc.Document): Analyzed text
        feature (str): Name of the CoNLL-U feature to extract

    Returns:
        list[list[int | str]]: Extracted features
    """
sentences_features = []
for conllu_sentence in document.sentences:
sentence_features = []
for word in conllu_sentence.words:
word_feature = word.to_dict()[feature]
sentence_features.append(word_feature)
sentences_features.append(sentence_features)
return sentences_features
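
# A minimal sketch of the statistics use case mentioned in the docstring above,
# assuming `analyzed` is a Document returned by analyze_text(); `Counter` and
# the flattening below are illustration choices, not part of the lab interface.
#     from collections import Counter
#     upos_tags = extract_linguistic_feature(analyzed, "upos")
#     pos_counts = Counter(tag for sentence in upos_tags for tag in sentence)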


def main() -> None:
    """
    Entrypoint for the seminar's listing
    """
# 1. Download and initialize Stanza model
stanza_model = load_model()
assert isinstance(stanza_model, stanza.pipeline.core.Pipeline)
# 2. Extract linguistic features
text_to_analyze = "Привет! Я люблю программировать."
analyzed_text = analyze_text(stanza_model, text_to_analyze)
assert isinstance(analyzed_text, Document)
print(analyzed_text)
# 3. Save extracted features to .conllu file
conllu_file_path = Path('analyzed_text.conllu')
export_conllu_annotation(analyzed_text, conllu_file_path)
assert conllu_file_path.exists()
# 4. Load text and its properties from .conllu file
loaded_analyzed_text = import_conllu_annotation(conllu_file_path)
assert isinstance(loaded_analyzed_text, Document)
print(loaded_analyzed_text)
# 5. Extract particular features from each word in each sentence
upos_information = extract_linguistic_feature(loaded_analyzed_text, "upos")
assert upos_information == [['NOUN', 'PUNCT'], ['PRON', 'VERB', 'VERB', 'PUNCT']]
deprel_information = extract_linguistic_feature(loaded_analyzed_text, "deprel")
assert deprel_information == [['root', 'punct'], ['nsubj', 'root', 'xcomp', 'punct']]
head_information = extract_linguistic_feature(loaded_analyzed_text, "head")
assert head_information == [[0, 1], [2, 0, 2, 2]]
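    # Note: the exact tags asserted above reflect the output of one particular
    # model release; a different stanza model version may predict slightly
    # different annotations and require updating these expected values.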


if __name__ == "__main__":
    main()