#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Author: Gözde Gül Şahin

Loads a (morphed) CoNLL-09 dataset and creates the vocabularies.
Calls TextLoader to get encodings of word parts.
Packs all information into a Dataset.
"""
# os, pickle and torch are used below; import them explicitly rather than
# relying on the wildcard imports to provide them
import os
import pickle
from itertools import chain

import torch

from IO.conll09 import *
from IO.conllud_fin import *
from SUB.wordpart import *
from dataset import *

use_cuda = torch.cuda.is_available()
dtype = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor

def wrap(mylist):
    """Wrap a list of numbers in a FloatTensor."""
    return torch.FloatTensor(mylist)

def to_lower(tokenLst):
    """Return a copy of the token list with every token lowercased."""
    return [w.lower() for w in tokenLst]

def to_float(mask):
    """Convert a binary int mask to a list of floats."""
    return [float(reg) for reg in mask]

def prepare_sequence(seq, to_ix):
    """
    Map each token to its vocabulary index; unknown tokens fall back
    to the UNK index (constants.UNK).
    :param seq: sequence of tokens
    :param to_ix: token-to-index dictionary
    :return: torch tensor of indices
    """
    idxs = []
    for w in seq:
        if w in to_ix:
            idxs.append(to_ix[w])
        else:
            idxs.append(constants.UNK)
    return torch.LongTensor(idxs)
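
# A minimal sketch of prepare_sequence in action (the toy vocabulary below is
# hypothetical; constants.UNK is whatever index the constants module defines):
#   vocab = {"the": 2, "cat": 3}
#   prepare_sequence(["the", "dog"], vocab)
#   # -> tensor([2, constants.UNK]); "dog" is out of vocabulary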

def prepare_label_sequence(seq, to_ix, modeltype="simple"):
    """
    Map a label sequence to indices. For a sequence model ("seq") the
    sequence is delimited with START and STOP tags; the "simple" model
    gets plain label indices; any other model type yields a binary
    sequence (0 for "_", 1 otherwise).
    :param seq: label sequence
    :param to_ix: label-to-index dictionary
    :param modeltype: "simple" or "seq"
    :return: torch tensor of label indices
    """
    idxs = []
    if modeltype == "seq":
        idxs.append(to_ix[constants.START_TAG])
    for l in seq:
        if modeltype == "seq" or modeltype == "simple":
            if l in to_ix:
                idxs.append(to_ix[l])
            else:
                idxs.append(constants.UNK)
        else:
            idxs.append(0 if l == "_" else 1)
    if modeltype == "seq":
        idxs.append(to_ix[constants.STOP_TAG])
    return torch.LongTensor(idxs)
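
# A sketch of the two main modes (hypothetical label vocabulary; the
# START_TAG/STOP_TAG keys come from the constants module):
#   roles = {"_": 2, "A0": 3, constants.START_TAG: 4, constants.STOP_TAG: 5}
#   prepare_label_sequence(["A0", "_"], roles, "simple")  # -> tensor([3, 2])
#   prepare_label_sequence(["A0", "_"], roles, "seq")     # -> tensor([4, 3, 2, 5])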

class Loader():
    def __init__(self, opt, train_file=None, dev_file=None, test_file=None,
                 save_dir=None, train=True, test=False, w2i=None, r2i=None):
        """
        :param opt: argument labeling options
        :param train_file: training file
        :param dev_file: development file; None means cross-validation mode
        :param test_file: test file (in test mode)
        :param save_dir: directory to save vocabularies
        :param train: True if in training mode
        :param test: True if in testing mode
        :param w2i: None, or a word vocabulary from pretrained word vectors
        :param r2i: None if training, else the role vocabulary from a previous loader
        """
        # word lookup table
        self.word_to_ix = w2i if (w2i is not None) else {}
        # semantic role lookup table
        self.role_to_ix = r2i if (r2i is not None) else {}
        self.options = opt
        self.c9sents = None
        self.train = train
        self.test = test
        self.ft_embeds = (opt.w2vtype == 'fasttext')
        self.role_vocab_file = os.path.join(save_dir, "role_vocab.pkl")
        self.word_vocab_file = os.path.join(save_dir, "words_vocab.pkl")
        # load the conll09 file, init lookups, prepare sorted data
        if train:
            opt.trainLst = self.prepare_input_for_subloader(train_file, opt.unit)
            self.subloader = TextLoader(opt, train=train)
            self.build_vocab(train_file)
            self.train_data = self.process(train_file)
            if dev_file is not None:
                self.dev_data = self.process(dev_file)
        elif test:
            self.subloader = TextLoader(opt, train=False)
            self.load_preprocessed()
            if test_file is not None:
                self.test_data = self.process(test_file)
            if train_file is not None:
                self.train_data = self.process(train_file)
            if dev_file is not None:
                self.dev_data = self.process(dev_file)
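
    # Typical construction (a sketch; the option object and file names are
    # assumptions, coming from the caller's argument parser):
    #   loader = Loader(opt, train_file="train.conll09", dev_file="dev.conll09",
    #                   save_dir="vocab/", train=True)
    #   dset = loader.getData(loader.train_data, train=True)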

    def load_preprocessed(self):
        """Load previously pickled role and word vocabularies."""
        with open(self.role_vocab_file, 'rb') as f:
            self.role_to_ix = pickle.load(f)
        with open(self.word_vocab_file, 'rb') as f:
            self.word_to_ix = pickle.load(f)

    def build_vocab(self, filepath):
        """
        Builds the role and word vocabularies.
        :param filepath: path to training file
        :return:
        """
        # fall back to the subloader's word_to_id if no pretrained embeddings are given
        if len(self.word_to_ix) == 0:
            self.word_to_ix = self.subloader.word_to_id
        # save the word vocabulary under the base folder
        with open(self.word_vocab_file, 'wb') as f:
            pickle.dump(self.word_to_ix, f)
        self.role_to_ix[constants.PAD_ROLE] = len(self.role_to_ix)
        self.role_to_ix[constants.UNK_ROLE] = len(self.role_to_ix)
        if self.c9sents is None:
            if self.options.lang == "fin":
                c9reader = conllud_fin(filepath)
            else:
                c9reader = conll09(filepath, self.options.use_predicted)
            self.c9sents = c9reader.sents
        # collect every role label seen in the training sentences
        for c9sent in self.c9sents:
            for i in range(c9sent.predcnt):
                # role labels of the i-th predicate
                sroles = c9sent.labels[i]
                for role in sroles:
                    if role not in self.role_to_ix:
                        self.role_to_ix[role] = len(self.role_to_ix)
        with open(self.role_vocab_file, 'wb') as f:
            pickle.dump(self.role_to_ix, f)
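
    # After build_vocab, role_to_ix might look like this (the role labels are
    # illustrative; PAD_ROLE and UNK_ROLE always take the first two slots):
    #   {constants.PAD_ROLE: 0, constants.UNK_ROLE: 1, "_": 2, "A0": 3, "A1": 4, ...}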

    def prepare_input_for_subloader(self, filepath, unit):
        """
        If the unit is "oracle", build the list from token oracles;
        otherwise use the token words.
        :param filepath: path to training file
        :param unit: subword unit
        :return: list of lowercased token lists, one per sentence
        """
        sentLst = []
        if self.c9sents is None:
            if self.options.lang == "fin":
                c9reader = conllud_fin(filepath)
            else:
                c9reader = conll09(filepath, self.options.use_predicted)
            self.c9sents = c9reader.sents
        for c9sent in self.c9sents:
            if unit == "oracle":
                tokenLst = c9sent.tokenOracles
            else:
                tokenLst = c9sent.tokenWords
            if len(tokenLst) <= self.options.max_seq_length:
                tokenLst = to_lower(tokenLst)
                sentLst.append(tokenLst)
        return sentLst
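
    # For unit != "oracle" the result is simply the lowercased sentences,
    # e.g. (sentences made up): [["the", "cat", "sat"], ["dogs", "bark"], ...]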

    def process(self, filepath):
        """
        Make a list of training data sorted according to sentence length.
        Packed info per predicate: words, morphological representation,
        predicate flag, predicate word, semantic roles.
        :param filepath: path to training file
        :return: packed training data
        """
        if self.options.lang == "fin":
            c9reader = conllud_fin(filepath)
        else:
            c9reader = conll09(filepath, self.options.use_predicted)
        c9sents = c9reader.sents
        # buckets keyed by sentence length, for sorting
        sentBucket = {}
        # in evaluation mode do not sort the data; there will be no batching
        if self.test:
            unsorted_data = []
            self.c9sents = c9sents
        # for each sentence
        for c9sent in c9sents:
            # tokens
            sentTokenLst = c9sent.tokenWords
            # oracles
            sentOracleLst = c9sent.tokenOracles
            if len(sentTokenLst) <= self.options.max_seq_length:
                sentTokenLst = to_lower(sentTokenLst)
                sentOracleLst = to_lower(sentOracleLst)
                numWords = len(sentTokenLst)
                # for each predicate
                for i in range(c9sent.predcnt):
                    predInd = c9sent.predind[i]
                    genData = []
                    # 1) tokens
                    genData.append(sentTokenLst)
                    # 2) oracles
                    genData.append(sentOracleLst)
                    # 3) binary flag for the predicate
                    binmask = [0] * numWords
                    binmask[predInd] = 1
                    genData.append(binmask)
                    # 4) predicate word
                    predWord = sentTokenLst[predInd].lower().split()
                    genData.append(predWord)
                    # 5) gold semantic roles
                    sroles = c9sent.labels[i]
                    genData.append(sroles)
                    ### sorting/batching
                    if numWords in sentBucket:
                        sentBucket[numWords].append(genData)
                    else:
                        sentBucket[numWords] = [genData]
                    if self.test:
                        unsorted_data.append(genData)
        sorted_data = list(chain.from_iterable(sentBucket.values()))
        # unsorted_data only exists in test mode, so key the return on self.test
        return unsorted_data if self.test else sorted_data
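
    # One packed item, for a made-up sentence "the cat sat" whose predicate
    # "sat" is at index 2 (the oracle analyses and role labels are illustrative):
    #   [["the", "cat", "sat"],            # 1) tokens
    #    ["the+DT", "cat+NN", "sit+VBD"],  # 2) oracle analyses
    #    [0, 0, 1],                        # 3) binary predicate mask
    #    ["sat"],                          # 4) predicate word
    #    ["_", "A0", "_"]]                 # 5) gold semantic roles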

    def getData(self, raw_data, train=True, modeltype="simple"):
        """
        :param raw_data: packed training data from the process function
        :param train: True if in training mode
        :param modeltype: can be ignored - will be "simple"
        :return: dataset object - ready to batch
        """
        dataTok = []
        dataSub = []
        dataBM = []
        dataPW = []
        dataRole = []
        unkWords = []
        for sentence, morph_anal, bmSeq, predWord, roles in raw_data:
            sentence = to_lower(sentence)
            for w in sentence:
                if w not in self.word_to_ix:
                    unkWords.append(w)
            # predicate to lowercase
            predWord[0] = predWord[0].lower()
            # 1) tokens
            # if fasttext, pass the words themselves
            if self.ft_embeds:
                dataTok.append(sentence)
            else:
                word_embed_ind = prepare_sequence(sentence, self.word_to_ix)
                dataTok.append(word_embed_ind)
            # 2) subword unit of your choice (comes already padded)
            if self.subloader.unit == "oracle":
                sub_encoded = self.subloader.encode_data(morph_anal)
            else:
                sub_encoded = self.subloader.encode_data(sentence)
            dataSub.append(torch.LongTensor(sub_encoded).type(self.options.otype))
            # 3) binary mask
            bmSeq = to_float(bmSeq)
            word_bin_feat = wrap(bmSeq)
            dataBM.append(word_bin_feat)
            # 4) predicate word
            word_pred_ind = prepare_sequence(predWord, self.word_to_ix)
            dataPW.append(word_pred_ind)
            # 5) semantic roles
            training_labels = prepare_label_sequence(roles, self.role_to_ix, modeltype)
            dataRole.append(training_labels)
        # In evaluation mode, do not keep the history (volatile)
        # When the model type is simple, the label sequence is padded with zeros
        if train:
            dset = Dataset([dataTok, dataSub, dataBM, dataPW], [dataRole],
                           self.options.batch_size, use_cuda, self.ft_embeds)
        else:
            dset = Dataset([dataTok, dataSub, dataBM, dataPW], [dataRole],
                           self.options.batch_size, use_cuda, self.ft_embeds, volatile=True)
        del dataTok, dataBM, dataPW, dataRole
        print("Number of unknown words: %d\n" % len(unkWords))
        return dset
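
# End-to-end usage (a sketch; the file names and the way Dataset is consumed
# afterwards are assumptions based on how this loader is typically driven):
#   loader = Loader(opt, train_file="train.conll09", save_dir="vocab/", train=True)
#   train_set = loader.getData(loader.train_data, train=True)
#   # train_set batches tokens, subword encodings, predicate masks,
#   # predicate words, and role labels together for the labeler.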