forked from fipl-hse/2022-2-level-ctlr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharticle.py
183 lines (149 loc) · 5 KB
/
article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""
Article implementation
"""
import enum
import re
from datetime import datetime
from pathlib import Path
from typing import Optional, Protocol, Sequence
from core_utils.constants import ASSETS_PATH
def date_from_meta(date_txt: str) -> datetime:
    """
    Parses a textual timestamp into a datetime object

    The text must follow the "YYYY-MM-DD HH:MM:SS" layout; any other
    layout makes datetime.strptime raise ValueError.
    """
    timestamp_layout = "%Y-%m-%d %H:%M:%S"
    parsed = datetime.strptime(date_txt, timestamp_layout)
    return parsed
def get_article_id_from_filepath(path: Path) -> int:
    """
    Reads the numeric article id encoded in a file name

    The id is the part of the file stem (name without the last suffix)
    that precedes the first underscore, e.g. "12_raw.txt" -> 12.
    """
    # partition keeps everything before the first '_' (or the whole stem
    # when there is no underscore at all)
    id_part, _, _ = path.stem.partition('_')
    return int(id_part)
def split_by_sentence(text: str) -> list[str]:
    """
    Splits the given text by sentence separators

    Runs of newlines/tabs are first replaced with '. ' so that line breaks
    act as sentence ends. The text is then split on a whitespace character
    that follows '.', '?', '!', '?"' or '!"' and precedes a capital
    Cyrillic letter. Fragments that are blank or not longer than
    10 characters are dropped.
    """
    # A split point is a whitespace character that:
    #   * is NOT preceded by "<word char>.<word char><any char>"
    #     (presumably dotted abbreviations such as "т.е.");
    #   * is NOT preceded by "<capital><lowercase>." (two-letter capitalised
    #     token followed by a dot);
    #   * IS preceded by sentence-final punctuation, optionally quoted;
    #   * IS followed by a capital Cyrillic letter.
    pattern = r"(?<!\w\.\w.)(?<![А-Я][а-я]\.)((?<=\.|\?|!)|(?<=\?\"|!\"))\s(?=[А-Я])"

    # Inside a character class '|' is a literal pipe, not alternation, so the
    # class must list only the newline and the tab; otherwise every '|' in the
    # text would be rewritten to '. ' as well.
    text = re.sub(r'[\n\t]+', '. ', text)

    # The capturing group in the pattern makes re.split emit an empty string
    # between neighbouring sentences; the blank check below discards those
    # together with whitespace-only fragments.
    sentences = [
        sentence
        for sentence in re.split(pattern, text)
        if sentence.replace(' ', '') and len(sentence) > 10
    ]
    return sentences
# pylint: disable=too-few-public-methods
class SentenceProtocol(Protocol):
    """
    Structural interface of a sentence object

    Declared here so that this module depends on the interface only and
    does not import the lab 6 ConlluSentence implementation directly
    (dependency inversion).
    """

    def get_cleaned_sentence(self) -> str:
        """
        Returns the sentence text: all tokens normalized and joined with a space
        """

    def get_tokens(self) -> list:
        """
        Returns the tokens of the sentence; each one should be a ConlluToken instance
        """

    def get_conllu_text(self, include_morphological_tags: bool) -> str:
        """
        Returns the sentence rendered in the CONLL-U format
        """
class ArtifactType(enum.Enum):
    """
    Kinds of artifacts that text processing pipelines can produce

    The value of each member is the text used inside the artifact file name.
    """

    # Plain cleaned text
    CLEANED = 'cleaned'

    # CONLL-U flavours
    MORPHOLOGICAL_CONLLU = 'morphological_conllu'
    POS_CONLLU = 'pos_conllu'
    FULL_CONLLU = 'full_conllu'
class Article:
    """
    Representation of a single article

    Keeps the raw text, the meta information and the CONLL-U sentences
    of the article and knows the paths of its files inside ASSETS_PATH.
    """

    date: Optional[datetime]
    _conllu_sentences: Sequence[SentenceProtocol]

    def __init__(self, url: Optional[str], article_id: int) -> None:
        """
        Creates an article with the given url and id

        Every other attribute starts empty and is expected to be filled in later.
        """
        # Identity of the article
        self.url = url
        self.article_id = article_id

        # Meta information
        self.title = ''
        self.date = None
        self.author = []
        self.topics = []

        # Content and processing results
        self.text = ''
        self.pos_frequencies = {}
        self._conllu_sentences = []

    def set_pos_info(self, pos_freq: dict) -> None:
        """
        Stores the given POS frequencies on the article
        """
        self.pos_frequencies = pos_freq

    def get_meta(self) -> dict:
        """
        Collects all meta parameters into a dictionary

        The date is given as text, or as None when no date is set.
        """
        date_text = self._date_to_text()
        meta = {
            'id': self.article_id,
            'url': self.url,
            'title': self.title,
            # an empty date text is reported as None
            'date': date_text if date_text else None,
            'author': self.author,
            'topics': self.topics,
            'pos_frequencies': self.pos_frequencies
        }
        return meta

    def get_raw_text(self) -> str:
        """
        Returns the raw text of the article
        """
        return self.text

    def get_conllu_text(self, include_morphological_tags: bool) -> str:
        """
        Returns the whole article in the CONLL-U format

        Sentences are rendered one by one, joined with a newline and
        followed by one trailing newline.
        """
        rendered = [
            sentence.get_conllu_text(include_morphological_tags)
            for sentence in self._conllu_sentences
        ]
        return '\n'.join(rendered) + '\n'

    def set_conllu_sentences(self, sentences: Sequence[SentenceProtocol]) -> None:
        """
        Stores the given CONLL-U sentences on the article
        """
        self._conllu_sentences = sentences

    def get_conllu_sentences(self) -> Sequence[SentenceProtocol]:
        """
        Returns the CONLL-U sentences stored on the article
        """
        return self._conllu_sentences

    def get_cleaned_text(self) -> str:
        """
        Returns the cleaned text: cleaned sentences joined with a space
        """
        cleaned = [
            sentence.get_cleaned_sentence()
            for sentence in self._conllu_sentences
        ]
        return ' '.join(cleaned)

    def _date_to_text(self) -> str:
        """
        Converts the article date to text; gives an empty string when no date is set
        """
        if not self.date:
            return ''
        return self.date.strftime("%Y-%m-%d %H:%M:%S")

    def get_raw_text_path(self) -> Path:
        """
        Returns the path of the raw text file of the article
        """
        return ASSETS_PATH / f"{self.article_id}_raw.txt"

    def get_meta_file_path(self) -> Path:
        """
        Returns the path of the meta file of the article
        """
        return ASSETS_PATH / f"{self.article_id}_meta.json"

    def get_file_path(self, kind: ArtifactType) -> Path:
        """
        Returns the path of an artifact file of the article

        kind: variant of a file -- ArtifactType.
        CONLL-U kinds get the '.conllu' extension, any other kind gets '.txt'.
        """
        conllu_kinds = (
            ArtifactType.POS_CONLLU,
            ArtifactType.MORPHOLOGICAL_CONLLU,
            ArtifactType.FULL_CONLLU,
        )
        if kind in conllu_kinds:
            extension = '.conllu'
        else:
            extension = '.txt'
        return ASSETS_PATH / f"{self.article_id}_{kind.value}{extension}"

    def get_pos_freq(self) -> dict:
        """
        Returns the POS frequencies stored on the article
        """
        return self.pos_frequencies