-
Notifications
You must be signed in to change notification settings - Fork 2
/
basic_to_squad.py
319 lines (270 loc) · 11 KB
/
basic_to_squad.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
"""Convert a simple JSON dataset into SQuAD format."""
from typing import Dict, List, Optional
from transformers import T5Tokenizer
import numpy.random as nr
from information_extraction_t5.features.context import get_context
from information_extraction_t5.features.questions.type_map import TYPENAME_TO_TYPE
from information_extraction_t5.features.preprocess import get_questions_for_chunk
WARNING_MISSING_TYPENAMES = []
def get_question_answers(document: Dict[str, str],
                         questions: Optional[List[str]] = None,
                         qa_id: str = 'publicacoes.instancia',
                         choose_question: str = 'first') -> List[Dict]:
    """Gets question-answers in SQUAD format for the specified type name.

    The answers encompass only the canonical response (value).

    Args:
        document: nested dict of annotated values; it is traversed using the
            dot-separated components of ``qa_id`` (the first component is the
            document type and is skipped).
        questions: candidate question strings for this ``qa_id``.
        qa_id: dotted identifier such as ``'publicacoes.instancia'``.
        choose_question: ``'first'`` keeps only the first question,
            ``'random'`` picks one at random, any other value keeps all.

    Returns:
        List of dictionaries where each element is a question and its answers.
        The list is empty when no questions are available.
    """
    if questions is None:
        questions = []
    if not questions:
        # No question available for this qa_id: return an empty list instead
        # of crashing on questions[0] / nr.randint(0).
        return []

    # Walk the nested document down the dotted path (skip the first
    # component, which names the document type, not a key).
    subanswer = document
    for type_name in qa_id.split('.')[1:]:
        subanswer = subanswer[type_name]

    # Select which question(s) to emit.
    if choose_question == 'first':
        selected_questions = [questions[0]]
    elif choose_question == 'random':
        selected_questions = [questions[nr.randint(len(questions))]]
    else:
        selected_questions = questions

    # Canonical answer: "[<human-readable type>]: <value>".
    answer = f"[{TYPENAME_TO_TYPE[type_name]}]: {subanswer}"
    return [
        {
            "answers": [
                {
                    "answer_start": -1,  # answer span is not located in the context
                    "text": answer,
                }
            ],
            "question": question,
            "id": qa_id,
        }
        for question in selected_questions
    ]
def get_compound_question_answers(
        document: Dict[str, str],
        questions: Optional[List[str]] = None,
        qa_id: str = 'publicacoes.instancia_orgao_tipo',
        choose_question: str = 'first') -> List[Dict]:
    """Gets question-answers in SQUAD format for the specified type names.

    The answers encompass only the canonical response (value). The compound
    answer concatenates one "[type]: value" fragment per sub-field declared
    in the question signature, defaulting to "N/A" for missing fields.

    Args:
        document: nested dict of annotated values; ``document[type_name]``
            must map sub-type-names to their values.
        questions: candidate question strings for this ``qa_id``.
        qa_id: dotted identifier such as ``'publicacoes.instancia_orgao_tipo'``.
        choose_question: ``'first'`` keeps only the first question,
            ``'random'`` picks one at random, any other value keeps all.

    Returns:
        List of dictionaries where each element is a question and its answers.
        The list is empty when no questions are available.
    """
    if questions is None:
        questions = []
    if not questions:
        # No question available: return an empty list instead of crashing.
        return []

    # Select which question(s) to emit.
    if choose_question == 'first':
        selected_questions = [questions[0]]
    elif choose_question == 'random':
        selected_questions = [questions[nr.randint(len(questions))]]
    else:
        selected_questions = questions

    type_name = qa_id.split('.')[1]

    # Build the answer template: every sub-field of the question signature
    # starts as N/A; the 'compound' entry is the question itself, not a field.
    all_type_names = get_questions_for_chunk(qa_id=qa_id, return_dict=True).copy()
    all_type_names.pop('compound', None)
    for tn in all_type_names:
        all_type_names[tn] = f'[{TYPENAME_TO_TYPE[tn]}]: N/A'

    # Fill the template with the annotated sub-answers. The TYPENAME_TO_TYPE
    # lookup happens only for known fields, so an unexpected type-name hits
    # the warning branch instead of raising KeyError.
    for tn, subanswer in document[type_name].items():
        if tn in all_type_names:
            all_type_names[tn] = f"[{TYPENAME_TO_TYPE[tn]}]: {subanswer}"
        elif tn not in WARNING_MISSING_TYPENAMES:
            print(f'WARNING: type-name {tn} is not in question signature for {type_name}: please add it in the OrderedDict if you want to keep.')
            WARNING_MISSING_TYPENAMES.append(tn)

    answer = ' '.join(all_type_names.values())

    return [
        {
            "answers": [
                {
                    "answer_start": -1,  # answer span is not located in the context
                    "text": answer,
                }
            ],
            "question": question,
            "id": qa_id,
        }
        for question in selected_questions
    ]
def get_notapplicable_question_answers(
        qa_id: str = 'matriculas.endereco',
        choose_question: str = 'first',
        list_of_use_compound_question: Optional[List[str]] = None) -> List[Dict]:
    """Return a list of question-answers in SQUAD format for non-annotated
    type-names.

    Every answer is "N/A" (one fragment per sub-field for compound qa_ids).

    Args:
        qa_id: dotted identifier such as ``'matriculas.endereco'``.
        choose_question: ``'first'`` keeps only the first question,
            ``'random'`` picks one at random, any other value keeps all.
        list_of_use_compound_question: qa_ids that must use the compound
            question/answer format.

    Returns:
        List of dictionaries where each element is a question and its answers.
        The list is empty when no questions are available.
    """
    if list_of_use_compound_question is None:
        list_of_use_compound_question = []
    is_compound = qa_id in list_of_use_compound_question

    questions = get_questions_for_chunk(qa_id=qa_id, is_compound=is_compound)
    if not questions:
        # get_questions_for_chunk may yield nothing for this qa_id: return an
        # empty list instead of crashing on questions[0] / nr.randint(0).
        return []

    # Select which question(s) to emit.
    if choose_question == 'first':
        selected_questions = [questions[0]]
    elif choose_question == 'random':
        selected_questions = [questions[nr.randint(len(questions))]]
    else:
        selected_questions = questions

    if is_compound:
        # One "[type]: N/A" fragment per sub-field of the question signature
        # ('compound' is the question itself, not a field).
        all_type_names = get_questions_for_chunk(qa_id=qa_id, return_dict=True).copy()
        all_type_names.pop('compound', None)
        for tn in all_type_names:
            all_type_names[tn] = f'[{TYPENAME_TO_TYPE[tn]}]: N/A'
        answer = ' '.join(all_type_names.values())
    else:
        type_name = qa_id.split('.', 1)[1]
        answer = f"[{TYPENAME_TO_TYPE[type_name]}]: N/A"

    return [
        {
            "answers": [
                {
                    "answer_start": -1,  # answer span is not located in the context
                    "text": answer,
                }
            ],
            "question": question,
            "id": qa_id,
        }
        for question in selected_questions
    ]
def get_document_data(document: Dict,
                      document_type: str = 'publicacoes',
                      all_qa_ids: Optional[List[str]] = None,
                      max_size: int = 4000,
                      list_of_use_compound_question: Optional[List[str]] = None,
                      list_of_type_names: Optional[List[str]] = None,
                      context_content: str = 'abertura',
                      window_overlap: float = 0.5,
                      max_windows: int = 3,
                      tokenizer: T5Tokenizer = None,
                      max_tokens: int = 512,
                      choose_question: str = 'first',
                      use_sentence_id: bool = False):
    """Convert one document into a SQuAD-format entry (title + paragraphs).

    Each extracted context becomes one paragraph holding the question-answers
    for every qa_id in ``list_of_type_names`` that matches ``document_type``;
    qa_ids from ``all_qa_ids`` that were not satisfied get not-applicable QAs.

    Args:
        document: raw document dict; must carry 'uuid' and either 'text' or
            'texto'.
        document_type: only qa_ids whose first dotted component equals this
            value are processed.
        all_qa_ids: full set of expected qa_ids; defaults to
            ``['publicacoes.orgao']``. (None-sentinel avoids a mutable
            default argument.)
        max_size: maximum context size passed to get_context.
        list_of_use_compound_question: qa_ids that use the compound format.
        list_of_type_names: qa_ids to extract from the document.
        context_content: context-selection strategy for get_context.
        window_overlap: overlap ratio for windowed contexts.
        max_windows: maximum number of context windows.
        tokenizer: tokenizer used by get_context to bound the context length.
        max_tokens: token budget for each context.
        choose_question: question-selection mode ('first'/'random'/other).
        use_sentence_id: forwarded to get_context.

    Returns:
        Tuple of (document_data dict in SQuAD format, number of QAs added).
    """
    if all_qa_ids is None:
        all_qa_ids = ['publicacoes.orgao']
    if list_of_type_names is None:
        list_of_type_names = []
    if list_of_use_compound_question is None:
        list_of_use_compound_question = []

    # Assuming that this is the largest question (used for the token budget).
    largest_question = 'Quais são as principais informações do documento de publicação?'

    # Create a dummy document with just the fields get_context needs.
    dummy_document = {}
    dummy_document['text'] = document['text'] if 'text' in document.keys() else document['texto']
    dummy_document['uuid'] = document['uuid']
    # Exclude crazy chars.
    dummy_document['text'] = dummy_document['text'].replace('༡༨/༢', '')

    # Extract the context(s) and respective offset(s).
    contexts, offsets = get_context(
        dummy_document,
        context_content=context_content,
        max_size=max_size,
        start_position=0,
        proportion_before=0.2,
        return_position_offset=True,
        use_sentence_id=use_sentence_id,
        tokenizer=tokenizer,
        max_tokens=max_tokens,
        question=largest_question,
        window_overlap=window_overlap,
        max_windows=max_windows)
    if not isinstance(contexts, list):
        contexts = [contexts]
        offsets = [offsets]

    # Document structure in SQuAD format: the uuid is the title; paragraphs
    # hold one context each plus all its question-answers.
    document_data = {
        "title": document['uuid'],
        "paragraphs": []
    }
    counter_qas = 0
    for context, _ in zip(contexts, offsets):
        # Create one paragraph for each context. It will be unique, except
        # for windows-based context_contents.
        paragraph = {
            "context": context,
            "qas": [],
        }
        paragraph_counter_qas = 0
        # Control which of the requested qa_ids were satisfied. It forces
        # not-applicable qas for qa_ids whose information does not exist in
        # the dataset.
        all_qa_ids_satisfied = []

        # We will use only the fields listed in list_of_type_names.
        for qa_id in list_of_type_names:
            doc_type = qa_id.split('.')[0]
            if doc_type != document_type:
                continue
            if qa_id in list_of_use_compound_question:
                questions = get_questions_for_chunk(qa_id=qa_id, is_compound=True)
                qas = get_compound_question_answers(
                    document,
                    questions=questions,
                    qa_id=qa_id,
                    choose_question=choose_question)
            else:
                questions = get_questions_for_chunk(qa_id=qa_id)
                qas = get_question_answers(document,
                                           questions=questions,
                                           qa_id=qa_id,
                                           choose_question=choose_question)
            paragraph_counter_qas += len(qas)
            # Include the question-answer of the current type_name (e.g.,
            # tipo) in the current paragraph of the current document.
            for qa in qas:
                paragraph["qas"].append(qa)
            all_qa_ids_satisfied.append(qa_id)

        # Extract not-applicable qas for non-existent information.
        add_not_applicable = sorted(
            list(set(all_qa_ids) - set(all_qa_ids_satisfied))
        )
        for qa_id in add_not_applicable:
            qas = get_notapplicable_question_answers(
                qa_id=qa_id,
                choose_question='first',  # avoid using too much negatives
                list_of_use_compound_question=list_of_use_compound_question)
            paragraph_counter_qas += len(qas)
            # Include the not-applicable question-answer in the current
            # paragraph of the current document.
            for qa in qas:
                paragraph["qas"].append(qa)
            all_qa_ids_satisfied.append(qa_id)

        # Add the current paragraph in the structure (skip empty paragraphs).
        if paragraph_counter_qas > 0:
            document_data["paragraphs"].append(paragraph)
            counter_qas += paragraph_counter_qas
    return document_data, counter_qas