-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbert_clc_concatevi.py
268 lines (189 loc) · 10 KB
/
bert_clc_concatevi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
# This file contains the code for the Claim Label Classification task: Data -> Training -> Classification Model
import json
import pandas as pd
import torch
import random
import torch.nn as nn
from transformers import BertTokenizer
from transformers import BertModel
from collections import Counter, defaultdict
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import time
from torch.nn import functional as F
from main import path_prefix
clc_model_params_filename = path_prefix + 'cfeverlabelcls.dat'
# ----------Hyperparameters of the entire pipeline----------
# --------------Claim Label Classification--------------
d_bert_base = 768
gpu = 0
input_seq_max_len = 512
loader_batch_size = 16
loader_worker_num = 2
num_epoch = 10
max_evi_num = 5
num_of_classes = 4
opti_lr_clc = 2e-5
label_mapper_ltoi = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}
label_mapper_itol = {0: 'SUPPORTS', 1: 'REFUTES', 2: 'NOT_ENOUGH_INFO', 3: 'DISPUTED'}
# ------------------------------------------------------
class CFEVERLabelTrainDataset(Dataset):
"""Climate Fact Extraction and Verification Dataset for Training, for the Evidence Retrival task."""
def __init__(self, claims, evidences_, max_len=input_seq_max_len):
self.data_set = [claims[c] for c in claims]
self.max_len = max_len
self.claims = claims
self.evidences = evidences_
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def __len__(self):
return len(self.data_set)
def __getitem__(self, index):
claim = self.data_set[index]
claim_evidences = claim['evidences']
random.shuffle(claim_evidences)
evidences_combined = " ".join([self.evidences[eid] for eid in claim_evidences])
# Preprocessing the text to be suitable for BERT
claim_evidence_in_tokens = self.tokenizer.encode_plus(claim['claim_text'], evidences_combined,
return_tensors='pt', padding='max_length', truncation=True,
max_length=self.max_len, return_token_type_ids=True)
seq, attn_masks, segment_ids = claim_evidence_in_tokens['input_ids'].squeeze(0), claim_evidence_in_tokens[
'attention_mask'].squeeze(0), claim_evidence_in_tokens['token_type_ids'].squeeze(0)
return seq, attn_masks, segment_ids, label_mapper_ltoi[claim['claim_label']]
class CFEVERLabelTestDataset(Dataset):
"""Climate Fact Extraction and Verification Dataset for Testing, for the Evidence Retrival task."""
def __init__(self, claims, evidences_, max_len=input_seq_max_len):
self.data_set = [(c, claims[c]) for c in claims]
self.max_len = max_len
self.claims = claims
self.evidences = evidences_
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def __len__(self):
return len(self.data_set)
def __getitem__(self, index):
claim_id, claim = self.data_set[index]
claim_evidences = claim['evidences']
# random.shuffle(claim_evidences)
evidences_combined = " ".join([self.evidences[eid] for eid in claim_evidences])
# Preprocessing the text to be suitable for BERT
claim_evidence_in_tokens = self.tokenizer.encode_plus(claim['claim_text'], evidences_combined,
return_tensors='pt', padding='max_length', truncation=True,
max_length=self.max_len, return_token_type_ids=True)
seq, attn_masks, segment_ids = claim_evidence_in_tokens['input_ids'].squeeze(0), claim_evidence_in_tokens[
'attention_mask'].squeeze(0), claim_evidence_in_tokens['token_type_ids'].squeeze(0)
return seq, attn_masks, segment_ids, claim_id
class CFEVERLabelClassifier(nn.Module):
def __init__(self):
super(CFEVERLabelClassifier, self).__init__()
# Instantiating BERT model object
self.bert = BertModel.from_pretrained('bert-base-uncased')
# Classification layer
# input dimension is 768 because [CLS] embedding has a dimension of 768, if bert base is used
# output dimension is 1 because we're working with a binary classification problem - RELEVANT : NOT RELEVANT
self.cls_layer = nn.Linear(d_bert_base, num_of_classes)
def forward(self, seq, attn_masks, segment_ids):
'''
Inputs:
-seq : Tensor of shape [B, T] containing token ids of sequences
-attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
-segment_ids : Tensor of shape [B, T] containing token ids of segment embeddings (see BERT paper for more details)
'''
# Feeding the input to BERT model to obtain contextualized representations
outputs = self.bert(seq, attention_mask=attn_masks, token_type_ids=segment_ids, return_dict=True)
cont_reps = outputs.last_hidden_state
# Obtaining the representation of [CLS] head (the first token)
cls_rep = cont_reps[:, 0]
# Feeding cls_rep to the classifier layer
logits = self.cls_layer(cls_rep)
return logits # logits shape is [B, num_of_classes]
def train_claim_cls(net, loss_criterion, opti, train_loader, dev_loader, dev_claims, gpu, max_eps=num_epoch):
best_acc = 0
mean_losses = [0] * max_eps
for ep in range(max_eps):
net.train() # Good practice to set the mode of the model
st = time.time()
train_acc = 0
count = 0
for i, (b_seq, b_attn_masks, b_segment_ids, b_label) in enumerate(train_loader):
# Reset/Clear gradients
opti.zero_grad()
# Extracting the tokens ids, attention masks and token type ids
b_seq, b_attn_masks, b_segment_ids, b_label = b_seq.cuda(gpu), b_attn_masks.cuda(gpu), b_segment_ids.cuda(gpu), b_label.cuda(gpu)
# Obtaining the logits from the model
logits = net(b_seq, b_attn_masks, b_segment_ids)
# Computing loss
loss = loss_criterion(logits, b_label)
mean_losses[ep] += loss.item()
count += 1
train_acc += get_accuracy_from_logits(logits, b_label)
# Backpropagating the gradients, account for gradients
loss.backward()
# Optimization step, apply the gradients
opti.step()
if i % 100 == 0:
print("Iteration {} of epoch {} complete. Time taken (s): {}".format(i, ep, (time.time() - st)))
st = time.time()
mean_losses[ep] /= count
print(f"Epoch {ep} completed. Loss: {mean_losses[ep]}, Accuracy: {train_acc / count}.")
dev_acc = evaluate_dev(net, dev_loader, dev_claims, gpu)
print("\nEpoch {} complete! Development Accuracy on dev claim labels: {}.".format(ep, dev_acc))
if dev_acc > best_acc:
print("Best development accuracy improved from {} to {}, saving model...\n".format(best_acc, dev_acc))
best_acc = dev_acc
torch.save(net.state_dict(), clc_model_params_filename)
else:
print()
return mean_losses
def get_accuracy_from_logits(logits, labels):
probs = F.softmax(logits, dim=-1)
predicted_classes = torch.argmax(probs, dim=1)
acc = (predicted_classes.squeeze() == labels).float().mean()
return acc
def get_predictions_from_logits(logits):
probs = F.softmax(logits, dim=-1)
predicted_classes = torch.argmax(probs, dim=1)
return predicted_classes.squeeze()
def predict(net, dataloader, gpu):
net.eval()
claim_labels = {}
df = pd.DataFrame()
with torch.no_grad():
for b_seq, b_attn_masks, b_segment_ids, b_claim_id in dataloader:
b_seq, b_attn_masks, b_segment_ids, = b_seq.cuda(gpu), b_attn_masks.cuda(gpu), b_segment_ids.cuda(gpu)
logits = net(b_seq, b_attn_masks, b_segment_ids)
preds = get_predictions_from_logits(logits)
df = pd.concat([df, pd.DataFrame({'claim_ids': b_claim_id, 'preds': preds.cpu()})], ignore_index=True)
for _, row in df.iterrows():
claim_id = row['claim_ids']
label = row['preds']
claim_labels[claim_id] = label_mapper_itol[label]
return claim_labels
def evaluate_dev(net, dataloader, dev_claims, gpu):
claim_labels = predict(net, dataloader, gpu)
correct_labels = 0
for claim_id in dev_claims:
if claim_labels[claim_id] == dev_claims[claim_id]["claim_label"]:
correct_labels += 1
return correct_labels / len(dev_claims) # claim label accuracy
def extract_claim_evi_labels(test_claims, claim_labels, output_filename):
for claim in claim_labels:
test_claims[claim]["claim_label"] = claim_labels[claim]
with open(output_filename, 'w') as f:
json.dump(test_claims, f)
print("Final test claims predictions file ready.")
return test_claims
def clc_pipeline(train_claims, dev_claims, evidences):
net_clc = CFEVERLabelClassifier()
net_clc.cuda(gpu) # Enable gpu support for the model
class_counts = Counter([train_claims[claim]["claim_label"] for claim in train_claims])
class_weights = torch.tensor([(sum(class_counts.values()) / class_counts[c]) for c in label_mapper_ltoi.keys()])
loss_criterion = nn.CrossEntropyLoss(weight=class_weights).cuda(gpu)
opti_clc = optim.Adam(net_clc.parameters(), lr=opti_lr_clc)
train_set = CFEVERLabelTrainDataset(train_claims, evidences)
dev_set = CFEVERLabelTestDataset(dev_claims, evidences)
train_loader = DataLoader(train_set, batch_size=loader_batch_size, num_workers=loader_worker_num)
dev_loader = DataLoader(dev_set, batch_size=loader_batch_size, num_workers=loader_worker_num)
train_claim_cls(net_clc, loss_criterion, opti_clc, train_loader, dev_loader, dev_claims, gpu)
net_clc.load_state_dict(torch.load(clc_model_params_filename))
return net_clc
if __name__ == '__main__':
pass