Commit
adjust local learning rate and decay according to gradient accumulation
Divide the local rate by `iter_size` so the accumulated gradient is normalized by the full minibatch size rather than only the computational batch size. Multiply the local decay by `iter_size` to counter the division of the local learning rate, since the decay is multiplied by the rate in the update equation.
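A minimal sketch of the intended arithmetic. The names (`base_lr`, `lr_mult`, `weight_decay`, `decay_mult`, `iter_size`) echo Caffe's solver conventions, but the function itself is illustrative, not the actual solver code:

```cpp
#include <vector>
#include <cstddef>

// Sketch of a per-parameter SGD update under gradient accumulation.
// accumulated_grads holds the gradient SUM over iter_size computational
// batches; the scaling below turns that sum into a full-minibatch update.
void sgd_update(std::vector<float>& weights,
                const std::vector<float>& accumulated_grads,
                float base_lr, float lr_mult,
                float weight_decay, float decay_mult,
                int iter_size) {
  // Dividing by iter_size converts the accumulated gradient sum into an
  // average over the full minibatch, not just one computational batch.
  float local_rate = base_lr * lr_mult / iter_size;
  // Multiplying the decay by iter_size cancels the division above, so the
  // effective decay term local_rate * local_decay =
  // base_lr * lr_mult * weight_decay * decay_mult is unchanged.
  float local_decay = weight_decay * decay_mult * iter_size;
  for (std::size_t i = 0; i < weights.size(); ++i) {
    // Update: w -= local_rate * (grad_sum + local_decay * w)
    weights[i] -= local_rate * (accumulated_grads[i] + local_decay * weights[i]);
  }
}
```

Expanding the decay term shows why both adjustments are needed: `local_rate * local_decay * w` works out to `base_lr * lr_mult * weight_decay * decay_mult * w`, with `iter_size` cancelling, so regularization strength is independent of how the minibatch is split.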