This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Gradient accumulation #3512

Closed
wants to merge 32 commits
Changes from 28 commits
Commits
32 commits
9573e55
Added default predictor for bimpm model
scarecrow1123 Nov 5, 2018
563647a
Merge pull request #1 from allenai/master
scarecrow1123 Dec 10, 2018
5ccf8ad
Merge pull request #2 from allenai/master
scarecrow1123 Jan 1, 2019
aaeddef
Merge pull request #3 from allenai/master
scarecrow1123 Apr 15, 2019
96db26f
Fix #2717: Add day count in training duration
scarecrow1123 Apr 15, 2019
64552a0
Modify elapsed time format to use `timedelta`
scarecrow1123 Apr 16, 2019
d0ac4ca
Merge branch 'master' of git://github.com/allenai/allennlp
scarecrow1123 Apr 17, 2019
383bf6d
Add gradient accumulation support
scarecrow1123 Apr 17, 2019
ad94b1f
Add doc for gradient accumulation
scarecrow1123 Apr 17, 2019
a33865b
Fix linter errors
scarecrow1123 Apr 17, 2019
f7a8ff7
Fix gradient accumulation to work for multi GPU
scarecrow1123 Apr 23, 2019
6f45aa3
Add test for gradient accumulation
scarecrow1123 Apr 23, 2019
955e2c4
Merge 'upstream/master' into gradient-accumulation
scarecrow1123 Apr 23, 2019
e2e48e3
Rename `num_batch_groups` and clarify usage
scarecrow1123 Apr 24, 2019
12a87ca
Add comments to clarify gradient accumulation
scarecrow1123 Apr 24, 2019
71398f1
Add more checks in gradient accumulation test
scarecrow1123 Apr 24, 2019
07cd9b2
Fix linter error
scarecrow1123 Apr 24, 2019
c66557f
Merge remote-tracking branch 'origin/master' into GradientAccumulation
dirkgr Dec 11, 2019
b1dbf6d
Remove unused parameter
dirkgr Dec 11, 2019
43fc57b
Comment changes
dirkgr Dec 11, 2019
2a45ecf
Typo
dirkgr Dec 11, 2019
b6c47ff
More concise comment
dirkgr Dec 11, 2019
6c697ea
Scale the loss appropriately to the number of gradient accumulation s…
dirkgr Dec 11, 2019
1cc0f87
Fixes the name of the config option
dirkgr Dec 11, 2019
c56006f
Productivity through formatting
dirkgr Dec 11, 2019
437b490
Fix training loss
dirkgr Dec 11, 2019
4659312
More formatting
dirkgr Dec 11, 2019
83087fa
Remove overeager warning
dirkgr Dec 12, 2019
a7fc41b
Be more lenient about variable batch sizes
dirkgr Dec 12, 2019
7da83de
Revert "Be more lenient about variable batch sizes"
dirkgr Dec 12, 2019
e7935c5
Fix for infinite iterator
dirkgr Dec 16, 2019
74ee2b3
Cut batches in half
dirkgr Dec 16, 2019
2 changes: 1 addition & 1 deletion allennlp/common/util.py
@@ -91,7 +91,7 @@ def sanitize(x: Any) -> Any:
def group_by_count(iterable: List[Any], count: int, default_value: Any) -> List[List[Any]]:
    """
    Takes a list and groups it into sublists of size ``count``, using ``default_value`` to pad the
-    list at the end if the list is not divisable by ``count``.
+    list at the end if the list is not divisible by ``count``.

    For example:
    >>> group_by_count([1, 2, 3, 4, 5, 6, 7], 3, 0)
27 changes: 27 additions & 0 deletions allennlp/tests/training/trainer_test.py
@@ -5,6 +5,7 @@
import re
import time
from typing import Dict
+from pathlib import Path

import torch
import pytest
@@ -832,6 +833,32 @@ def test_restoring_works_with_older_checkpointing(self):
        assert trainer._metric_tracker._best_so_far == 0.1
        assert trainer._metric_tracker._epochs_with_no_improvement == 1

+    def test_trainer_can_run_gradient_accumulation(self):
+        with Path(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv").open() as input_file:
+            num_training_instances = sum(1 for i in input_file)
+
+        steps_to_accumulate = 2
+
+        trainer = Trainer(
+            self.model,
+            self.optimizer,
+            self.iterator,
+            self.instances,
+            validation_dataset=self.instances,
+            num_epochs=2,
+            num_gradient_accumulation_steps=steps_to_accumulate,
+        )
+        assert trainer._num_gradient_accumulation_steps == steps_to_accumulate
+
+        metrics = trainer.train()
+
+        num_batches_trained_per_epoch = trainer._batch_num_total // (metrics["training_epochs"] + 1)
+        num_batches_expected = (
+            num_training_instances // self.iterator._batch_size // steps_to_accumulate
+        )
+
+        assert num_batches_trained_per_epoch == num_batches_expected
+

class TestSparseClipGrad(AllenNlpTestCase):
    def test_sparse_clip_grad(self):
37 changes: 28 additions & 9 deletions allennlp/training/trainer.py
@@ -60,6 +60,7 @@ def __init__(
        should_log_learning_rate: bool = False,
        log_batch_size_period: Optional[int] = None,
        moving_average: Optional[MovingAverage] = None,
+        num_gradient_accumulation_steps: int = 1,
    ) -> None:
        """
        A trainer for doing supervised learning. It just takes a labeled dataset
@@ -171,6 +172,10 @@ def __init__(
            parameters. Be careful that when saving the checkpoint, we will save the moving averages of
            parameters. This is necessary because we want the saved model to perform as well as the validated
            model if we load it later. But this may cause problems if you restart the training from checkpoint.
+        num_gradient_accumulation_steps: ``int``, optional, (default = 1)
+            Gradients are accumulated for the given number of steps before doing an optimizer step. This can
+            be useful to accommodate effective batch sizes that are too large to fit in memory. Refer to
+            Thomas Wolf's [post](https://tinyurl.com/y5mv44fw) for details on gradient accumulation.
        """
        super().__init__(serialization_dir, cuda_device)

@@ -250,6 +255,8 @@ def __init__(

        self._last_log = 0.0  # time of last logging

+        self._num_gradient_accumulation_steps = num_gradient_accumulation_steps
+
        # Enable activation logging.
        if histogram_interval is not None:
            self._tensorboard.enable_activation_logging(self.model)
@@ -300,12 +307,18 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:
        # Set the model to "train" mode.
        self.model.train()

-        num_gpus = len(self._cuda_devices)
+        # A `batch_group` has chunks of tensors that form a single batch together for an optimizer
+        # step. A single chunk always contains as many instances as configured in the iterator's
+        # `batch_size` param. The number of chunks in a single `batch_group` is

Contributor:
This isn't precisely accurate. Iterators create batches that aren't `batch_size` for many reasons. Maybe say something like, for the "simple case of a BasicIterator".

Member Author:
"typically contains"?

What are the other possibilities? Gradient accumulation becomes more complicated when the batches aren't all the same size, and this code doesn't handle that case properly. Neither does the multi-GPU code.

Contributor:
`maximum_samples_per_batch` being the relevant config option.

Contributor:
In the case of that LM we also scale our loss internally based on the number of tokens.

+        # `num_gradient_accumulation_steps` * `num_gpus`.
+        batch_group_length = self._num_gradient_accumulation_steps * len(self._cuda_devices)

        # Get tqdm for the training batches
        raw_train_generator = self.iterator(self.train_data, num_epochs=1, shuffle=self.shuffle)
-        train_generator = lazy_groups_of(raw_train_generator, num_gpus)
-        num_training_batches = math.ceil(self.iterator.get_num_batches(self.train_data) / num_gpus)
+        train_generator = lazy_groups_of(raw_train_generator, batch_group_length)
+        num_training_batches = math.ceil(
+            self.iterator.get_num_batches(self.train_data) / batch_group_length
+        )
        self._last_log = time.time()
        last_save_time = time.time()
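
To make the grouping arithmetic above concrete, here is a minimal runnable sketch (not part of the diff) using `lazy_groups_of` from `allennlp.common.util`; the integers stand in for the tensor dicts the iterator would yield:

from allennlp.common.util import lazy_groups_of

num_gradient_accumulation_steps = 2
num_gpus = 2  # len(self._cuda_devices) in the trainer
batch_group_length = num_gradient_accumulation_steps * num_gpus  # 4 batches per optimizer step

# Stand-in for `self.iterator(self.train_data, ...)`: ten "batches", here just ints.
raw_train_generator = iter(range(10))

for batch_group in lazy_groups_of(raw_train_generator, batch_group_length):
    # Each group feeds one optimizer step: 2 accumulation steps, each spread over 2 GPUs.
    print(len(batch_group), batch_group)
# Prints:
# 4 [0, 1, 2, 3]
# 4 [4, 5, 6, 7]
# 2 [8, 9]   <- the trailing group can be shorter, as the review comments note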

@@ -325,14 +338,18 @@ def _train_epoch(self, epoch: int) -> Dict[str, float]:

            self.optimizer.zero_grad()

-            loss = self.batch_loss(batch_group, for_training=True)
-
-            if torch.isnan(loss):
-                raise ValueError("nan loss encountered")
+            batches_for_step = list(lazy_groups_of(iter(batch_group), len(self._cuda_devices)))

Contributor:
The conversion to a list seems unnecessary.

Member Author:
It's needed because I need to know how many there are to scale the loss.

Contributor:
👍

+            for batch_for_step in batches_for_step:
+                loss = self.batch_loss(batch_for_step, for_training=True)
+                if torch.isnan(loss):
+                    raise ValueError("nan loss encountered")

-            loss.backward()
+                # `len(batches_for_step)` should always be `num_gradient_accumulation_steps`, except
+                # for the last batch in the epoch.
+                loss = loss / len(batches_for_step)

Contributor:
It would be good if the loss per sub-batch was scaled relative to its proportion of the overall accumulated batch - e.g., a batch of size 64 and a batch of size 12 would get weighted evenly here. You can do this with `training_util.get_batch_size(batch)`.

Contributor:
Are we sure we want this? Sometimes we're already scaling by sample size. For instance, https://github.com/allenai/allennlp/blob/master/allennlp/models/language_model.py#L322.

Contributor:
Oh! Sorry, my bad. Disregard.

Member Author:
I'm trying this right now.

Member Author:
Done in a7fc41b.

Contributor:
@dirkgr, I might not have been clear. We shouldn't do this. It breaks cases where users have scaled by sample size in their models.

Member Author:
After a long discussion, I reverted a7fc41b. Looks like we're going to break somebody, but this at least keeps the more common cases the same.
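
For reference, a rough sketch of the proportional weighting discussed above, as tried in a7fc41b and later reverted; this is a reconstruction, not the actual contents of that commit. It is written as if it ran inside the same loop as the diff (so `batch_group`, `self.batch_loss`, `self._cuda_devices`, and `train_loss` come from the trainer), and it weights each sub-batch by its share of the instances instead of dividing by the number of sub-batches:

from allennlp.training import util as training_util

batches_for_step = list(lazy_groups_of(iter(batch_group), len(self._cuda_devices)))
# Instances per sub-batch, summed over the per-GPU chunks in each sub-batch.
batch_sizes = [
    sum(training_util.get_batch_size(chunk) for chunk in batch_for_step)
    for batch_for_step in batches_for_step
]
total_size = sum(batch_sizes)

for batch_for_step, batch_size in zip(batches_for_step, batch_sizes):
    loss = self.batch_loss(batch_for_step, for_training=True)
    if torch.isnan(loss):
        raise ValueError("nan loss encountered")
    # e.g. 64 / 76 for a large sub-batch and 12 / 76 for a small one.
    loss = loss * (batch_size / total_size)
    loss.backward()
    train_loss += loss.item()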

+                loss.backward()

-            train_loss += loss.item()
+                train_loss += loss.item()

            batch_grad_norm = self.rescale_gradients()

@@ -697,6 +714,7 @@ def from_params(  # type: ignore
        grad_clipping = params.pop_float("grad_clipping", None)
        lr_scheduler_params = params.pop("learning_rate_scheduler", None)
        momentum_scheduler_params = params.pop("momentum_scheduler", None)
+        num_gradient_accumulation_steps = params.pop("num_gradient_accumulation_steps", 1)

        if isinstance(cuda_device, list):
            model_device = cuda_device[0]
@@ -779,4 +797,5 @@ def from_params(  # type: ignore
            should_log_learning_rate=should_log_learning_rate,
            log_batch_size_period=log_batch_size_period,
            moving_average=moving_average,
+            num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        )
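
Finally, a hedged usage sketch mirroring the new test; `model`, `optimizer`, `iterator`, and `train_dataset` are placeholders rather than anything defined in this PR. With the iterator's `batch_size` set to 8 and four accumulation steps, each optimizer step aggregates gradients from 32 instances; the same option can also be set in a configuration file via the `num_gradient_accumulation_steps` key that `from_params` pops above.

from allennlp.training.trainer import Trainer

trainer = Trainer(
    model,                              # an allennlp Model (placeholder)
    optimizer,                          # e.g. torch.optim.Adam(model.parameters())
    iterator,                           # e.g. BasicIterator(batch_size=8)
    train_dataset,                      # list of Instances (placeholder)
    num_epochs=10,
    num_gradient_accumulation_steps=4,  # 8 * 4 = 32 instances per optimizer step
)
metrics = trainer.train()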