train.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
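"""Train an image captioning model on MSCOCO or STAIR Captions with Chainer.

The script expects the dataset to have been downloaded beforehand (e.g. with
the repository's `download.py` script) and saves model snapshots that can be
loaded by `predict.py` to generate captions for new images.

Illustrative invocation (flags taken from the argument parser below; adjust
paths and values to your own setup)::

    python train.py --gpu 0 --dataset-name mscoco --mscoco-root data
"""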
import argparse

import chainer
from chainer.datasets import TransformDataset
from chainer import iterators
from chainer import optimizers
from chainer import training
from chainer.training import extensions

import datasets
from model import ImageCaptionModel


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', type=str, default='result',
                        help='Output directory')
    parser.add_argument('--dataset-name', type=str, default='mscoco',
                        choices=["mscoco", "stair_captions"],
                        help='Dataset to train on (MSCOCO or STAIR Captions)')
    parser.add_argument('--mscoco-root', type=str, default='data',
                        help='MSCOCO dataset root directory')
    parser.add_argument('--max-iters', type=int, default=50000,
                        help='Maximum number of iterations to train')
    parser.add_argument('--batch-size', type=int, default=128,
                        help='Minibatch size')
    parser.add_argument('--dropout-ratio', type=float, default=0.5,
                        help='Language model dropout ratio')
    parser.add_argument('--val-keep-quantity', type=int, default=100,
                        help='Keep every N-th validation image')
    parser.add_argument('--val-iter', type=int, default=100,
                        help='Run validation every N-th iteration')
    parser.add_argument('--log-iter', type=int, default=1,
                        help='Log every N-th iteration')
    parser.add_argument('--snapshot-iter', type=int, default=1000,
                        help='Model snapshot every N-th iteration')
    parser.add_argument('--rnn', type=str, default='nsteplstm',
                        choices=['nsteplstm', 'lstm'],
                        help='Language model layer type')
    parser.add_argument('--gpu', type=int, default=0,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--max-caption-length', type=int, default=30,
                        help='Maximum caption length when using the LSTM layer')
    args = parser.parse_args()
    # Set path to annotation files
    if args.dataset_name == "mscoco":
        train_anno = "annotations/captions_train2014.json"
        val_anno = "annotations/captions_val2014.json"
    elif args.dataset_name == "stair_captions":
        train_anno = "annotations/stair_captions_v1.2_train_tokenized.json"
        val_anno = "annotations/stair_captions_v1.2_val_tokenized.json"

    # Load the MSCOCO dataset. Assumes that the dataset has already been
    # downloaded, e.g. using the `download.py` script
    train, val = datasets.get_mscoco(
        args.mscoco_root, train_anno=train_anno, val_anno=val_anno,
        dataset_name=args.dataset_name)

    # Validation samples are used to detect overfitting and to see how well
    # the model generalizes to yet unseen data. However, since the number of
    # these samples in MSCOCO is quite large (~200k) and thus takes time to
    # evaluate, you may choose to use only a fraction of the available samples
    val = val[::args.val_keep_quantity]
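    # With the default --val-keep-quantity of 100, roughly 2,000 of the ~200k
    # validation samples are kept for evaluation (approximation based on the
    # figures in the comment above).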
    # Number of unique words that are found in the dataset
    vocab_size = len(train.vocab)

    # Instantiate the model to be trained either with LSTM layers or with
    # NStepLSTM layers
    model = ImageCaptionModel(
        vocab_size, dropout_ratio=args.dropout_ratio, rnn=args.rnn)

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    def transform(in_data):
        # Called for each sample and applies necessary preprocessing to the
        # image such as resizing and normalizing
        img, caption = in_data
        img = model.prepare(img)
        return img, caption

    # We need to preprocess the images since their sizes may vary (and the
    # model requires that they have the exact same fixed size)
    train = TransformDataset(train, transform)
    val = TransformDataset(val, transform)
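    # Note that TransformDataset applies `transform` lazily, i.e. each time a
    # sample is fetched by an iterator, rather than preprocessing the whole
    # dataset up front.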
    train_iter = iterators.MultiprocessIterator(
        train, args.batch_size, shared_mem=700000)
    val_iter = chainer.iterators.MultiprocessIterator(
        val, args.batch_size, repeat=False, shuffle=False, shared_mem=700000)

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    def converter(batch, device):
        # The converter receives a batch of input samples and may modify it
        # if necessary. In our case, we need to align the captions depending
        # on whether we are using LSTM layers or NStepLSTM layers in the model
        if args.rnn == 'lstm':
            max_caption_length = args.max_caption_length
        elif args.rnn == 'nsteplstm':
            max_caption_length = None
        else:
            raise ValueError('Invalid RNN type.')
        return datasets.converter(
            batch, device, max_caption_length=max_caption_length)
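    # Presumably, `datasets.converter` pads or truncates captions to
    # `max_caption_length` when a fixed length is given (LSTM case) and keeps
    # variable-length sequences when it is None (NStepLSTM case); see
    # `datasets.py` for the actual behavior.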
    updater = training.updater.StandardUpdater(
        train_iter, optimizer=optimizer, device=args.gpu, converter=converter)
    trainer = training.Trainer(
        updater, out=args.out, stop_trigger=(args.max_iters, 'iteration'))

    trainer.extend(
        extensions.Evaluator(
            val_iter,
            target=model,
            converter=converter,
            device=args.gpu
        ),
        trigger=(args.val_iter, 'iteration')
    )
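    # The Evaluator reports its metrics under the 'validation/' prefix (e.g.
    # 'validation/main/loss'), which is how the validation keys used by the
    # reporting extensions below are populated.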
    trainer.extend(
        extensions.LogReport(
            ['main/loss', 'validation/main/loss'],
            trigger=(args.log_iter, 'iteration')
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ['main/loss', 'validation/main/loss'],
            trigger=(args.log_iter, 'iteration')
        )
    )
    trainer.extend(
        extensions.PrintReport(
            ['elapsed_time', 'epoch', 'iteration', 'main/loss',
             'validation/main/loss']
        ),
        trigger=(args.log_iter, 'iteration')
    )

    # Save model snapshots so that later on, we can load them and generate new
    # captions for any image. This can be done in the `predict.py` script
    trainer.extend(
        extensions.snapshot_object(model, 'model_{.updater.iteration}'),
        trigger=(args.snapshot_iter, 'iteration')
    )
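    # A snapshot saved this way can later be restored into a freshly built
    # model, e.g. (illustrative path; `predict.py` presumably does something
    # similar):
    #
    #     chainer.serializers.load_npz('result/model_1000', model)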
    trainer.extend(extensions.ProgressBar())

    trainer.run()


if __name__ == '__main__':
    main()