Merge pull request BVLC#6 from zhirongw/googlenet

Get update accumulation from me
puzzledqs · Oct 7, 2014 · 54b40b3 · 54b40b3
2 parents c2aaebd + 069c0aa
commit 54b40b3
Show file tree

Hide file tree

Showing 20 changed files with 2,921 additions and 17 deletions.
diff --git a/examples/imagenet/create_imagenet.sh b/examples/imagenet/create_imagenet.sh
@@ -6,12 +6,13 @@ EXAMPLE=examples/imagenet
 DATA=data/ilsvrc12
 TOOLS=build/tools
 
-TRAIN_DATA_ROOT=/path/to/imagenet/train/
-VAL_DATA_ROOT=/path/to/imagenet/val/
+DEST=/home/common/imagenet
+TRAIN_DATA_ROOT=/home/common/imagenet/train/
+VAL_DATA_ROOT=/home/common/imagenet/val/
 
 # Set RESIZE=true to resize the images to 256x256. Leave as false if images have
 # already been resized using another tool.
-RESIZE=false
+RESIZE=true
 if $RESIZE; then
   RESIZE_HEIGHT=256
   RESIZE_WIDTH=256
@@ -42,7 +43,7 @@ GLOG_logtostderr=1 $TOOLS/convert_imageset \
     --shuffle \
     $TRAIN_DATA_ROOT \
     $DATA/train.txt \
-    $EXAMPLE/ilsvrc12_train_lmdb
+    $DEST/ilsvrc12_train_lmdb
 
 echo "Creating val lmdb..."
 
@@ -52,6 +53,6 @@ GLOG_logtostderr=1 $TOOLS/convert_imageset \
     --shuffle \
     $VAL_DATA_ROOT \
     $DATA/val.txt \
-    $EXAMPLE/ilsvrc12_val_lmdb
+    $DEST/ilsvrc12_val_lmdb
 
 echo "Done."
diff --git a/examples/imagenet/make_imagenet_mean.sh b/examples/imagenet/make_imagenet_mean.sh
@@ -2,7 +2,7 @@
 # Compute the mean image from the imagenet training leveldb
 # N.B. this is available in data/ilsvrc12
 
-./build/tools/compute_image_mean examples/imagenet/ilsvrc12_train_leveldb \
+./build/tools/compute_image_mean /home/common/imagenet/ilsvrc12_train_lmdb \
   data/ilsvrc12/imagenet_mean.binaryproto
 
 echo "Done."
diff --git a/examples/imagenet/train_caffenet.sh b/examples/imagenet/train_caffenet.sh
@@ -1,4 +1,6 @@
 #!/usr/bin/env sh
 
+GOOGLE_LOG_DIR=models/bvlc_reference_caffenet \
 ./build/tools/caffe train \
-    --solver=models/bvlc_reference_caffenet/solver.prototxt
+    --solver=models/bvlc_reference_caffenet/solver.prototxt \
+    --gpu=2
diff --git a/examples/imagenet/train_googlenet.sh b/examples/imagenet/train_googlenet.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env sh
+
+GOOGLE_LOG_DIR=models/googlenet \
+./build/tools/caffe train \
+    --solver=models/googlenet/solver.prototxt \
+    --gpu=3
diff --git a/examples/imagenet/train_googlenet_gpus.sh b/examples/imagenet/train_googlenet_gpus.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env sh
+
+GLOG_alsologtostderr=1 \
+GOOGLE_LOG_DIR=models/googlenet \
+./build/examples/parallel/gpus.bin \
+    models/googlenet/solver.prototxt 1:2
diff --git a/examples/imagenet/train_vgg.sh b/examples/imagenet/train_vgg.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env sh
+
+GOOGLE_LOG_DIR=models/vgg \
+./build/tools/caffe train \
+    --solver=models/vgg/solver.prototxt \
+    --gpu=1
diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
@@ -94,10 +94,14 @@ class Blob {
   const Dtype* gpu_data() const;
   const Dtype* cpu_diff() const;
   const Dtype* gpu_diff() const;
+  const Dtype* cpu_acum_diff() const;
+  const Dtype* gpu_acum_diff() const;
   Dtype* mutable_cpu_data();
   Dtype* mutable_gpu_data();
   Dtype* mutable_cpu_diff();
   Dtype* mutable_gpu_diff();
+  Dtype* mutable_gpu_acum_diff();
+  Dtype* mutable_cpu_acum_diff();
   void Update();
   void FromProto(const BlobProto& proto);
   void ToProto(BlobProto* proto, bool write_diff = false) const;
@@ -107,6 +111,10 @@ class Blob {
   /// @brief Compute the sum of absolute values (L1 norm) of the diff.
   Dtype asum_diff() const;
 
+  // Added to allow a bigger batch size.
+  void AccumulateDiff();
+  void UpdateDiff();
+
   /**
    * @brief Set the data_ shared_ptr to point to the SyncedMemory holding the
    *        data_ of Blob other -- useful in Layer&s which simply perform a copy
@@ -129,6 +137,7 @@ class Blob {
  protected:
   shared_ptr<SyncedMemory> data_;
   shared_ptr<SyncedMemory> diff_;
+  shared_ptr<SyncedMemory> acum_diff_;
   int num_;
   int channels_;
   int height_;

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
@@ -127,14 +127,18 @@ class Caffe {
   static void SetDevice(const int device_id);
   // Prints the current GPU status.
   static void DeviceQuery();
+  // Added to allow a bigger batch size.
+  inline static void set_accumulate(bool acum) { Get().accumulate_ = acum; }
+  inline static bool accumulate() { return Get().accumulate_; }
 
  protected:
 #ifndef CPU_ONLY
   cublasHandle_t cublas_handle_;
   curandGenerator_t curand_generator_;
 #endif
   shared_ptr<RNG> random_generator_;
-
+  // Added to allow a bigger batch size.
+  bool accumulate_;
   Brew mode_;
   Phase phase_;
   static shared_ptr<Caffe> singleton_;

diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
@@ -85,6 +85,9 @@ class Net {
   /// @brief Updates the network weights based on the diff values computed.
   void Update();
 
+  // Added to allow a bigger batch size.
+  void AccumulateDiff();
+  void UpdateDiff();
   /**
    * @brief For an already initialized net, implicitly copies (i.e., using no
    *        additional memory) the pre-trained layers from another Net.

diff --git a/models/bvlc_reference_caffenet/solver.prototxt b/models/bvlc_reference_caffenet/solver.prototxt
@@ -1,14 +1,17 @@
 net: "models/bvlc_reference_caffenet/train_val.prototxt"
 test_iter: 1000
 test_interval: 1000
-base_lr: 0.01
+base_lr: 0.004
 lr_policy: "step"
 gamma: 0.1
 stepsize: 100000
 display: 20
 max_iter: 450000
-momentum: 0.9
+momentum: 0.95
 weight_decay: 0.0005
 snapshot: 10000
 snapshot_prefix: "models/bvlc_reference_caffenet/caffenet_train"
 solver_mode: GPU
+test_initialization: false
+debug_info: true
+debug_display: 1000
diff --git a/models/bvlc_reference_caffenet/train_val.prototxt b/models/bvlc_reference_caffenet/train_val.prototxt
@@ -5,9 +5,9 @@ layers {
   top: "data"
   top: "label"
   data_param {
-    source: "examples/imagenet/ilsvrc12_train_lmdb"
+    source: "/home/common/imagenet/ilsvrc12_train_lmdb"
     backend: LMDB
-    batch_size: 256
+    batch_size: 128
   }
   transform_param {
     crop_size: 227
@@ -22,7 +22,7 @@ layers {
   top: "data"
   top: "label"
   data_param {
-    source: "examples/imagenet/ilsvrc12_val_lmdb"
+    source: "/home/common/imagenet/ilsvrc12_val_lmdb"
     backend: LMDB
     batch_size: 50
   }

diff --git a/models/googlenet/solver.prototxt b/models/googlenet/solver.prototxt
@@ -0,0 +1,18 @@
+net: "models/googlenet/train_val.prototxt"
+test_iter: 1000
+test_interval: 1000
+update_interval: 2
+base_lr: 0.0002
+lr_policy: "step"
+gamma: 0.98
+stepsize: 10
+display: 20
+max_iter: 450000
+momentum: 0.9
+weight_decay: 0.0005
+snapshot: 10000
+snapshot_prefix: "models/googlenet/googlenet_train"
+solver_mode: GPU
+test_initialization: false
+debug_info: true
+debug_display: 1000