From 778688d359f826e5204939300845314475daef39 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer
Date: Thu, 28 May 2015 12:43:29 -0700
Subject: [PATCH] directly normalize accumulated gradients

`SGDSolver::Normalize()` normalizes accumulated gradients by scaling
inversely to the accumulation as `1 / iter_size`.

This is more obvious than fooling with rates and decays in 55585f5.
---
 include/caffe/solver.hpp |  1 +
 src/caffe/solver.cpp     | 32 +++++++++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index da1bab13663..c2ced487d6f 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -81,6 +81,7 @@ class SGDSolver : public Solver<Dtype> {
   void PreSolve();
   Dtype GetLearningRate();
   virtual void ApplyUpdate();
+  virtual void Normalize(int param_id);
   virtual void Regularize(int param_id);
   virtual void ComputeUpdateValue(int param_id, Dtype rate);
   virtual void ClipGradients();
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 4c8fa25c955..aabe0edec80 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -487,12 +487,39 @@ void SGDSolver<Dtype>::ApplyUpdate() {
   }
   ClipGradients();
   for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
+    Normalize(param_id);
     Regularize(param_id);
-    ComputeUpdateValue(param_id, rate / this->param_.iter_size());
+    ComputeUpdateValue(param_id, rate);
   }
   this->net_->Update();
 }
 
+template <typename Dtype>
+void SGDSolver<Dtype>::Normalize(int param_id) {
+  if (this->param_.iter_size() == 1) { return; }
+  // Scale gradient to counterbalance accumulation.
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    caffe_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
 template <typename Dtype>
 void SGDSolver<Dtype>::Regularize(int param_id) {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
@@ -500,8 +527,7 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
       this->net_->params_weight_decay();
   Dtype weight_decay = this->param_.weight_decay();
   string regularization_type = this->param_.regularization_type();
-  Dtype local_decay = weight_decay * net_params_weight_decay[param_id]
-      * this->param_.iter_size();
+  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     if (local_decay) {