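"""Training loop for the Transformer translation model.

Builds a dataset from data/output/train.txt, trains the Transformer with a
scheduled Adam optimizer, and logs loss, learning rate, and accuracy curves
to TensorBoard under data/tensorboard.
"""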
import logging
import math
import os

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

from data.textor import Textor
from data.translation_dataset import TranslationDataset
from optimizer import ScheduledOptim
from transformer.transformer import Transformer


class Trainer:
    def __init__(self, tokenizer, dataloader, transformer, optimizer, device):
        self.logger = logging.getLogger('Trainer')
        self.tokenizer = tokenizer
        self.dataloader = dataloader
        self.model = transformer
        self.optimizer = optimizer
        self.device = device
        self.log_interval = 10
        self.tb_writer = SummaryWriter(log_dir=os.path.join("data", 'tensorboard'))

    def train(self, epochs):
        for epoch in range(epochs):
            self.logger.info("\n\n\nEpoch {}".format(epoch))
            total_loss, loss_per_word, accuracy = self.train_epoch(
                smoothing=False, trg_pad_idx=self.tokenizer.TRG_VOCAB["<pad>"])
            # Perplexity is the exponential of the per-word loss, clamped to avoid overflow.
            train_ppl = math.exp(min(loss_per_word, 100))
            self.tb_writer.add_scalars('loss', {'train': loss_per_word}, epoch)
            self.tb_writer.add_scalars('ppl', {'train': train_ppl}, epoch)
            self.tb_writer.add_scalars('lr', {'train': self.optimizer.last_lr}, epoch)
            self.tb_writer.add_scalars('accuracy', {'train': accuracy * 100}, epoch)

    def patch_trg(self, trg):
        trg, gold = trg[:, :-1], trg[:, 1:].contiguous().view(-1)
        return trg, gold
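
    # patch_trg above implements teacher forcing: for a (hypothetical) target
    # sequence [<s>, w1, w2, </s>], the decoder input is [<s>, w1, w2] and the
    # flattened gold labels are [w1, w2, </s>].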

    def train_epoch(self, smoothing, trg_pad_idx):
        self.model.train()
        total_loss, n_word_total, n_word_correct = 0, 0, 0
        for batch_idx, (src_batch, trg_batch) in enumerate(self.dataloader):
            src_batch = src_batch.to(self.device)
            trg_batch = trg_batch.to(self.device)
            loss, n_correct, n_word = self.train_batch(src_batch, trg_batch, smoothing, trg_pad_idx)
            self.log_batch_result(batch_idx, loss, n_correct, n_word)
            n_word_total += n_word
            n_word_correct += n_correct
            total_loss += loss.item()
        loss_per_word = total_loss / n_word_total
        self.logger.info("\n\n Epoch loss per word = {}".format(loss_per_word))
        accuracy = n_word_correct / n_word_total
        return total_loss, loss_per_word, accuracy

    def train_batch(self, src_batch, trg_batch, smoothing, trg_pad_idx):
        trg_batch, gold = self.patch_trg(trg_batch)
        self.optimizer.zero_grad()
        prediction = self.model(src_batch, trg_batch)
        n_correct, n_word = self.calculate_performance(prediction, gold, trg_pad_idx)
        loss = self.calculate_loss(prediction, gold, trg_pad_idx, smoothing=smoothing)
        self.learn(loss, self.optimizer)
        return loss, n_correct, n_word

    def log_batch_result(self, batch_idx, loss, n_correct, n_word):
        if self.is_time_to_log_batch_result(batch_idx, self.log_interval):
            self.logger.info("batch index = {}".format(batch_idx))
            self.logger.info("loss = {}".format(loss))
            self.logger.info("n_correct = {}".format(n_correct))
            self.logger.info("n_word = {}".format(n_word))
            self.logger.info("accuracy = {}".format(n_correct / n_word))

    def is_time_to_log_batch_result(self, batch_idx, log_interval):
        return batch_idx % log_interval == 0

    def learn(self, loss, optimizer):
        # Backpropagate, then let ScheduledOptim take a step and update its learning rate.
        loss.backward()
        optimizer.step_and_update_lr()

    def calculate_performance(self, pred, gold, trg_pad_idx):
        ''' Count correctly predicted tokens, ignoring padding positions. '''
        pred = pred.max(1)[1]
        gold = gold.contiguous().view(-1)
        non_pad_mask = gold.ne(trg_pad_idx)
        n_correct = pred.eq(gold).masked_select(non_pad_mask).sum().item()
        n_word = non_pad_mask.sum().item()
        return n_correct, n_word

    def calculate_loss(self, pred, gold, trg_pad_idx, smoothing=False):
        ''' Calculate cross entropy loss, apply label smoothing if needed. '''
        gold = gold.contiguous().view(-1)
        self.logger.debug("gold contiguous shape = {}".format(gold.shape))
        self.logger.debug("pred {}".format(pred[0][0:10]))
        self.logger.debug("gold {}".format(gold[0]))
        if smoothing:
            # Smoothed target: 1 - eps on the gold class and eps / (n_class - 1)
            # on every other class, e.g. eps=0.1 with n_class=5 gives 0.9 for
            # the gold class and 0.025 for each of the others.
            eps = 0.1
            n_class = pred.size(1)
            one_hot = torch.zeros_like(pred).scatter(1, gold.view(-1, 1), 1)
            one_hot = one_hot * (1 - eps) + (1 - one_hot) * eps / (n_class - 1)
            log_prb = F.log_softmax(pred, dim=1)
            non_pad_mask = gold.ne(trg_pad_idx)
            loss = -(one_hot * log_prb).sum(dim=1)
            loss = loss.masked_select(non_pad_mask).sum()  # average later
        else:
            loss = F.cross_entropy(pred, gold, ignore_index=trg_pad_idx, reduction='sum')
        return loss


def optimizer_for_model(model):
    lr_mul = 2
    n_warmup_steps = 4000
    return ScheduledOptim(
        optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
        lr_mul, model.d_model, n_warmup_steps)
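

# Illustrative sketch only (assumption): ScheduledOptim is expected to implement
# the inverse-square-root warmup schedule from "Attention Is All You Need",
#   lr = lr_mul * d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)
# so the 'lr' curve logged in Trainer.train should rise for n_warmup_steps steps
# and then decay. This helper is only a reference and is not called by the trainer.
def reference_lr(step, d_model, lr_mul=2, n_warmup_steps=4000):
    return lr_mul * d_model ** -0.5 * min(step ** -0.5, step * n_warmup_steps ** -1.5)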


def main():
    textor = Textor("data/output/train.txt")
    textor.build()
    dataset = TranslationDataset(textor)
    dataloader = dataset.loader(batch_size=10)
    transformer = Transformer(textor)
    transformer = transformer.to("cpu")
    optimizer = optimizer_for_model(transformer)
    trainer = Trainer(textor, dataloader, transformer, optimizer, device="cpu")
    trainer.train(epochs=10)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    main()
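

# To view the curves written by SummaryWriter (assuming TensorBoard is installed):
#   tensorboard --logdir data/tensorboard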