forked from tensorflow/tensor2tensor
Commit
Add MultistepAdamOptimizer: Large training batches on limited GPU hardware (tensorflow#754)
Simulates n times more GPUs at the cost of n times more training iterations.
1 parent f08cd6f · commit b7ab5b7
Showing 6 changed files with 278 additions and 3 deletions.
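For context (editor's note, not part of the commit): the underlying idea is plain gradient accumulation. Instead of applying an update after every mini-batch, the optimizer sums gradients over n batches and applies a single update with their average, so the effective batch size grows n-fold while peak GPU memory stays the same. A minimal sketch of that pattern in NumPy, using an SGD-style update as a stand-in for Adam:

import numpy as np

def accumulate_and_step(params, grads, lr=0.1, n=4):
  """Sum gradients over n batches, then apply one averaged update.

  Illustrative only: `params` is a float NumPy array, `grads` is a list of
  per-batch gradients, and the update rule is plain SGD rather than the Adam
  update used in this commit.
  """
  grad_acc = np.zeros_like(params)
  for step, g in enumerate(grads, start=1):
    grad_acc += g                        # accumulate instead of updating
    if step % n == 0:                    # every n-th batch: one averaged update
      params = params - lr * (grad_acc / n)
      grad_acc[:] = 0.0                  # reset the accumulator
  return params

The MultistepAdamOptimizer below implements the same schedule inside the TF 1.x optimizer API: a per-variable "grad_acc" slot holds the running sum, and the Adam update only runs on every n-th call.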
@@ -0,0 +1,139 @@
# coding=utf-8
# Copyright 2018 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Optimizer variants which make it possible to use very large batch sizes with
limited GPU memory. Optimizers in this module accumulate the gradients for n
batches, and call the optimizer's update rule every n batches with the
accumulated gradients.
See [Saunders et al., 2018](https://arxiv.org/abs/1805.00456) for details.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Dependency imports

import tensorflow as tf


class MultistepAdamOptimizer(tf.train.AdamOptimizer):
  """Adam with SGD updates every n steps with accumulated gradients."""

  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
               use_locking=False, name="Adam", n=1):
    super(MultistepAdamOptimizer, self).__init__(
        learning_rate=learning_rate, beta1=beta1, beta2=beta2, epsilon=epsilon,
        use_locking=use_locking, name=name)
    self._n = n  # Call Adam optimizer every n batches with accumulated grads
    self._n_t = None  # n as tensor

  def _create_slots(self, var_list):
    """Create slot variables for Adam with accumulated gradients.

    Like the super class method, but additionally creates slots for the
    gradient accumulator `grad_acc` and the counter variable.
    """
    super(MultistepAdamOptimizer, self)._create_slots(var_list)
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(initial_value=0 if self._n == 1 else 1,
                                   name="iter",
                                   colocate_with=first_var)
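    # Note (added for clarity, not in the original diff): starting the counter
    # at 1 when n > 1 means the first Adam update only fires after n gradients
    # have been accumulated; with n == 1 the counter stays at 0 and every call
    # applies an update, i.e. plain Adam behavior.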
    for v in var_list:
      self._zeros_slot(v, "grad_acc", self._name)

  def _get_iter_variable(self):
    if tf.contrib.eager.in_eager_mode():
      graph = None
    else:
      graph = tf.get_default_graph()
    return self._get_non_slot_variable("iter", graph=graph)

  def _prepare(self):
    super(MultistepAdamOptimizer, self)._prepare()
    self._n_t = tf.convert_to_tensor(self._n, name="n")

  def _apply_cond(self, apply_fn, grad, var, *args, **kwargs):
    """Conditionally apply or accumulate the gradient.

    Calls `apply_fn` only if the current counter value (iter) is zero. This
    method factors out the functionality common to all _apply_*()
    implementations in Adam.
    """
    grad_acc = self.get_slot(var, "grad_acc")

    def apply_adam(grad_acc, apply_fn, grad, var, *args, **kwargs):
      total_grad = (grad_acc + grad) / tf.cast(self._n_t, grad.dtype)
      adam_op = apply_fn(total_grad, var, *args, **kwargs)
      with tf.control_dependencies([adam_op]):
        grad_acc_to_zero_op = grad_acc.assign(tf.zeros_like(grad_acc),
                                              use_locking=self._use_locking)
      return tf.group(adam_op, grad_acc_to_zero_op)

    def accumulate_gradient(grad_acc, grad):
      assign_op = tf.assign_add(grad_acc, grad, use_locking=self._use_locking)
      return tf.group(assign_op)  # Strip return value

    return tf.cond(tf.equal(self._get_iter_variable(), 0),
                   lambda: apply_adam(
                       grad_acc, apply_fn, grad, var, *args, **kwargs),
                   lambda: accumulate_gradient(grad_acc, grad))

  def _apply_dense(self, grad, var):
    return self._apply_cond(
        super(MultistepAdamOptimizer, self)._apply_dense, grad, var)

  def _resource_apply_dense(self, grad, var):
    return self._apply_cond(
        super(MultistepAdamOptimizer, self)._resource_apply_dense, grad, var)

  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    return self._apply_cond(
        super(MultistepAdamOptimizer, self)._apply_sparse_shared, grad, var,
        indices, scatter_add)

  def _apply_sparse(self, grad, var):
    # TODO: Implement a sparse version
    dense_grad = tf.convert_to_tensor(grad)
    return self._apply_cond(
        super(MultistepAdamOptimizer, self)._apply_dense, dense_grad, var)

  def _finish(self, update_ops, name_scope):
    """Like the super class method, but updates the beta_power variables only
    every n batches. The iter variable is updated as
      iter <- (iter + 1) mod n
    """
    iter_ = self._get_iter_variable()
    beta1_power, beta2_power = self._get_beta_accumulators()
    with tf.control_dependencies(update_ops):
      with tf.colocate_with(iter_):

        def update_beta_op():
          update_beta1 = beta1_power.assign(
              beta1_power * self._beta1_t,
              use_locking=self._use_locking)
          update_beta2 = beta2_power.assign(
              beta2_power * self._beta2_t,
              use_locking=self._use_locking)
          return tf.group(update_beta1, update_beta2)

        maybe_update_beta = tf.cond(
            tf.equal(iter_, 0), update_beta_op, tf.no_op)
        with tf.control_dependencies([maybe_update_beta]):
          update_iter = iter_.assign(tf.mod(iter_ + 1, self._n_t),
                                     use_locking=self._use_locking)
    return tf.group(
        *update_ops + [update_iter, maybe_update_beta], name=name_scope)
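A minimal usage sketch (editor's addition, not part of the diff): the class is meant as a drop-in replacement for tf.train.AdamOptimizer under the TF 1.x API. The toy loss below is invented purely for illustration; only the import path and constructor arguments come from the commit.

import tensorflow as tf
from tensor2tensor.utils.multistep_optimizer import MultistepAdamOptimizer

# Toy problem, invented for illustration: fit a single scalar weight.
w = tf.Variable(1.0)
loss = tf.square(3.0 * w - 6.0)

# n=4: gradients are accumulated over 4 calls; one Adam update fires on the 4th.
opt = MultistepAdamOptimizer(learning_rate=0.01, n=4)
train_op = opt.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(8):  # 8 session runs -> 2 effective Adam updates
    sess.run(train_op)

Each sess.run(train_op) still computes gradients for one mini-batch; only the variable update is deferred, which is the "n times more training iterations" trade-off named in the commit message.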
@@ -0,0 +1,106 @@
# coding=utf-8
# Copyright 2018 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multi-step Optimizer Test Module for TensorFlow."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Dependency imports

import numpy as np
import tensorflow as tf
from tensor2tensor.utils.multistep_optimizer import MultistepAdamOptimizer


class MultistepAdamOptimizerTest(tf.test.TestCase):

  def testMultistep(self):
    ver = tf.__version__.split('.')
    # TODO: Remove version check once 1.5 is not tested anymore
    if int(ver[0]) <= 1 and int(ver[1]) < 6:
      # MultistepAdamOptimizer requires TF >= 1.6
      return
    dtype = tf.float32
    beta1 = 0.2
    beta2 = 0.99
    alpha = 10.0
    grads0_np_lst = [
        np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype),
        np.array([0.2, -0.1], dtype=dtype.as_numpy_dtype),
        np.array([0.3, 0.1], dtype=dtype.as_numpy_dtype),
        np.array([0.4, -0.1], dtype=dtype.as_numpy_dtype)
    ]
    grads1_np_lst = [
        np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype),
        np.array([0.02, 0.02], dtype=dtype.as_numpy_dtype),
        np.array([-0.04, 0.04], dtype=dtype.as_numpy_dtype),
        np.array([-0.04, 0.06], dtype=dtype.as_numpy_dtype)
    ]
    var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
    var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
    # Test accumulating gradients for n=1..4 steps
    for n in range(1, 5):
      with self.test_session():
        with self.test_session(graph=tf.Graph()):
          singlestep_var0 = tf.Variable(var0_np)
          singlestep_var1 = tf.Variable(var1_np)

          multistep_var0 = tf.Variable(var0_np)
          multistep_var1 = tf.Variable(var1_np)

          singlestep_opt = tf.train.AdamOptimizer(
              beta1=beta1, beta2=beta2, learning_rate=alpha)
          multistep_opt = MultistepAdamOptimizer(
              n=n, beta1=beta1, beta2=beta2, learning_rate=alpha)

          singlestep_update = singlestep_opt.apply_gradients([
              (tf.constant(sum(grads0_np_lst[:n]) / n), singlestep_var0),
              (tf.constant(sum(grads1_np_lst[:n]) / n), singlestep_var1)])
          multistep_updates = [
              multistep_opt.apply_gradients([(tf.constant(g0), multistep_var0),
                                             (tf.constant(g1), multistep_var1)])
              for g0, g1 in zip(grads0_np_lst, grads1_np_lst)][:n]

          self.evaluate(tf.global_variables_initializer())
          (singlestep_beta1_power,
           singlestep_beta2_power) = singlestep_opt._get_beta_accumulators()
          (multistep_beta1_power,
           multistep_beta2_power) = multistep_opt._get_beta_accumulators()

          # Run 3 steps of Adam
          for _ in range(1, 4):
            self.evaluate(singlestep_update)
            for multistep_update in multistep_updates:
              self.evaluate(multistep_update)

          self.assertAllCloseAccordingToType(
              self.evaluate(singlestep_beta1_power),
              self.evaluate(multistep_beta1_power))
          self.assertAllCloseAccordingToType(
              self.evaluate(singlestep_beta2_power),
              self.evaluate(multistep_beta2_power))
          # Validate updated params
          self.assertAllCloseAccordingToType(
              self.evaluate(singlestep_var0),
              self.evaluate(multistep_var0))
          self.assertAllCloseAccordingToType(
              self.evaluate(singlestep_var1),
              self.evaluate(multistep_var1))


if __name__ == "__main__":
  tf.test.main()
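The property the test exercises, spelled out: after n apply_gradients calls with gradients g_1, ..., g_n, the multistep optimizer performs exactly one Adam step using the averaged gradient (g_1 + ... + g_n) / n, so its variables and beta-power accumulators should match the plain AdamOptimizer that is fed that average directly (the singlestep_* baseline above). A trivial NumPy sanity check of just the averaging, reusing the test's grads0_np_lst values (editor's sketch, not part of the commit):

import numpy as np

n = 4
grads0 = [np.array([0.1, 0.1]), np.array([0.2, -0.1]),
          np.array([0.3, 0.1]), np.array([0.4, -0.1])]
grad_acc = np.zeros(2)
for g in grads0[:n]:
  grad_acc += g                  # what the "grad_acc" slot accumulates
avg = grad_acc / n               # gradient handed to the single Adam update
assert np.allclose(avg, sum(grads0[:n]) / n)  # same value the baseline is fed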