Decouple the computational batch size and minibatch size by accumulating gradients #1663

Closed
wants to merge 5 commits
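This patch decouples the minibatch size used for the parameter update from the batch size that has to fit in memory: layers stop zeroing their parameter diffs in Backward and instead accumulate into them, the solver zeroes all parameter diffs once at the start of each iteration, runs iter_size forward/backward passes, and divides the per-parameter learning rate by iter_size. The effective minibatch is therefore the net's batch_size times the solver's iter_size, at the memory cost of a single batch_size. The toy program below (a standalone illustration only, not part of the patch; all names are made up) shows why summing the sub-batch gradients and scaling the step by 1/iter_size reproduces a single step on the large batch.

// Toy illustration: (lr / k) * (g_1 + ... + g_k) == lr * mean(g_i),
// so k accumulated small-batch passes update the weight exactly like
// one pass over a k-times larger minibatch.
#include <cstdio>

int main() {
  const int k = 4;                              // plays the role of iter_size
  const float lr = 0.01f;                       // base learning rate
  const float g[k] = {0.2f, 0.4f, 0.1f, 0.3f};  // per-sub-batch gradients
  float w = 1.0f;                               // a single scalar "parameter"
  float sum = 0.0f;
  for (int i = 0; i < k; ++i) sum += g[i];      // Backward now accumulates the diff
  w -= (lr / k) * sum;                          // solver scales the rate by 1 / iter_size
  std::printf("updated w = %f\n", w);           // same as w - lr * (sum / k)
  return 0;
}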
7 changes: 5 additions & 2 deletions include/caffe/test/test_gradient_check_util.hpp
@@ -80,11 +80,14 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
CHECK_EQ(top_count, bottom[blob_id]->count());
}
}
- // First, figure out what blobs we need to check against.
+ // First, figure out what blobs we need to check against, and zero init
+ // parameter blobs.
vector<Blob<Dtype>*> blobs_to_check;
vector<bool> propagate_down(bottom.size(), check_bottom < 0);
for (int i = 0; i < layer->blobs().size(); ++i) {
- blobs_to_check.push_back(layer->blobs()[i].get());
+ Blob<Dtype>* blob = layer->blobs()[i].get();
+ caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
+ blobs_to_check.push_back(blob);
}
if (check_bottom < 0) {
for (int i = 0; i < bottom.size(); ++i) {
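Because Backward now adds into the parameter diffs instead of overwriting them, the gradient checker has to clear those diffs itself before computing the analytic gradient; otherwise stale values from a previous call would leak into the check. The new calling contract, sketched below (assumes a layer and its top/bottom vectors are already set up; not part of the patch):

// Callers that want a fresh gradient must zero the diff before Backward:
Blob<Dtype>* blob = layer->blobs()[0].get();
caffe_set(blob->count(), Dtype(0), blob->mutable_cpu_diff());  // clear accumulated diff
layer->Backward(top, propagate_down, bottom);                  // diff += new gradient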
7 changes: 0 additions & 7 deletions src/caffe/layers/conv_layer.cpp
@@ -39,13 +39,6 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
const Dtype* weight = this->blobs_[0]->cpu_data();
Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
- if (this->param_propagate_down_[0]) {
- caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
- }
- if (this->bias_term_ && this->param_propagate_down_[1]) {
- caffe_set(this->blobs_[1]->count(), Dtype(0),
- this->blobs_[1]->mutable_cpu_diff());
- }
for (int i = 0; i < top.size(); ++i) {
const Dtype* top_diff = top[i]->cpu_diff();
const Dtype* bottom_data = bottom[i]->cpu_data();
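The seven deleted lines are the per-call zeroing of weight_diff and the bias diff. With accumulation the diffs must survive across all iter_size Backward passes, so this zeroing moves out of the layers and into Solver<Dtype>::Step() (see the solver.cpp hunk below); the same deletion is repeated for the GPU, cuDNN, and deconvolution variants in the files that follow.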
7 changes: 0 additions & 7 deletions src/caffe/layers/conv_layer.cu
@@ -31,13 +31,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
const Dtype* weight = this->blobs_[0]->gpu_data();
Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
- if (this->param_propagate_down_[0]) {
- caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
- }
- if (this->bias_term_ && this->param_propagate_down_[1]) {
- caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
- this->blobs_[1]->mutable_gpu_diff());
- }
for (int i = 0; i < top.size(); ++i) {
const Dtype* top_diff = top[i]->gpu_diff();
// Bias gradient, if necessary.
2 changes: 0 additions & 2 deletions src/caffe/layers/cudnn_conv_layer.cu
@@ -54,12 +54,10 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
if (this->param_propagate_down_[0]) {
weight = this->blobs_[0]->gpu_data();
weight_diff = this->blobs_[0]->mutable_gpu_diff();
- caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
}
Dtype* bias_diff = NULL;
if (this->bias_term_ && this->param_propagate_down_[1]) {
bias_diff = this->blobs_[1]->mutable_gpu_diff();
- caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
}
for (int i = 0; i < top.size(); ++i) {
const Dtype* top_diff = top[i]->gpu_diff();
7 changes: 0 additions & 7 deletions src/caffe/layers/deconv_layer.cpp
@@ -39,13 +39,6 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
const Dtype* weight = this->blobs_[0]->cpu_data();
Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
- if (this->param_propagate_down_[0]) {
- caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
- }
- if (this->bias_term_ && this->param_propagate_down_[1]) {
- caffe_set(this->blobs_[1]->count(), Dtype(0),
- this->blobs_[1]->mutable_cpu_diff());
- }
for (int i = 0; i < top.size(); ++i) {
const Dtype* top_diff = top[i]->cpu_diff();
const Dtype* bottom_data = bottom[i]->cpu_data();
7 changes: 0 additions & 7 deletions src/caffe/layers/deconv_layer.cu
@@ -31,13 +31,6 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
const Dtype* weight = this->blobs_[0]->gpu_data();
Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
- if (this->param_propagate_down_[0]) {
- caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
- }
- if (this->bias_term_ && this->param_propagate_down_[1]) {
- caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
- this->blobs_[1]->mutable_gpu_diff());
- }
for (int i = 0; i < top.size(); ++i) {
const Dtype* top_diff = top[i]->gpu_diff();
const Dtype* bottom_data = bottom[i]->gpu_data();
4 changes: 2 additions & 2 deletions src/caffe/layers/inner_product_layer.cpp
@@ -81,13 +81,13 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
const Dtype* bottom_data = bottom[0]->cpu_data();
// Gradient with respect to weight
caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
- top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff());
+ top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
}
if (bias_term_ && this->param_propagate_down_[1]) {
const Dtype* top_diff = top[0]->cpu_diff();
// Gradient with respect to bias
caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
- bias_multiplier_.cpu_data(), (Dtype)0.,
+ bias_multiplier_.cpu_data(), (Dtype)1.,
this->blobs_[1]->mutable_cpu_diff());
}
if (propagate_down[0]) {
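The only change in the inner product layer is the beta argument of the BLAS wrappers: caffe_cpu_gemm computes C = alpha * op(A) * op(B) + beta * C and caffe_cpu_gemv computes y = alpha * op(A) * x + beta * y, so switching beta from 0 to 1 makes the weight and bias gradients accumulate into the existing diff rather than replace it. The .cu file below makes the identical change for the GPU path.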
4 changes: 2 additions & 2 deletions src/caffe/layers/inner_product_layer.cu
@@ -33,13 +33,13 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
const Dtype* bottom_data = bottom[0]->gpu_data();
// Gradient with respect to weight
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
- top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff());
+ top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
}
if (bias_term_ && this->param_propagate_down_[1]) {
const Dtype* top_diff = top[0]->gpu_diff();
// Gradient with respect to bias
caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
- bias_multiplier_.gpu_data(), (Dtype)0.,
+ bias_multiplier_.gpu_data(), (Dtype)1.,
this->blobs_[1]->mutable_gpu_diff());
}
if (propagate_down[0]) {
3 changes: 2 additions & 1 deletion src/caffe/proto/caffe.proto
@@ -65,7 +65,7 @@ message NetParameter {
// NOTE
// Update the next available ID when you add a new SolverParameter field.
//
- // SolverParameter next available ID: 35 (last added: stepvalue)
+ // SolverParameter next available ID: 36 (last added: iter_size)
message SolverParameter {
//////////////////////////////////////////////////////////////////////////////
// Specifying the train and test networks
@@ -118,6 +118,7 @@ message SolverParameter {
// Display the loss averaged over the last average_loss iterations
optional int32 average_loss = 33 [default = 1];
optional int32 max_iter = 7; // the maximum number of iterations
+ optional int32 iter_size = 35 [default = 1];
optional string lr_policy = 8; // The learning rate decay policy.
optional float gamma = 9; // The parameter to compute the learning rate.
optional float power = 10; // The parameter to compute the learning rate.
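With the new field, the effective minibatch seen by the optimizer is the net's batch_size multiplied by the solver's iter_size; for example (illustrative numbers, not from this patch), batch_size: 8 in the net together with iter_size: 16 in the solver behaves like a minibatch of 128 while only ever holding 8 inputs in memory. The default of 1 keeps the previous behaviour.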
33 changes: 30 additions & 3 deletions src/caffe/solver.cpp
@@ -167,14 +167,39 @@ void Solver<Dtype>::Step(int iters) {
Dtype smoothed_loss = 0;

for (; iter_ < stop_iter; ++iter_) {
+ // zero-init the params
+ for (int i = 0; i < net_->params().size(); ++i) {
+ shared_ptr<Blob<Dtype> > blob = net_->params()[i];
+ switch (Caffe::mode()) {
+ case Caffe::CPU:
+ caffe_set(blob->count(), static_cast<Dtype>(0),
+ blob->mutable_cpu_diff());
+ break;
+ case Caffe::GPU:
+ #ifndef CPU_ONLY
+ caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+ blob->mutable_gpu_diff());
+ #else
+ NO_GPU;
+ #endif
+ break;
+ }
+ }
+
if (param_.test_interval() && iter_ % param_.test_interval() == 0
&& (iter_ > 0 || param_.test_initialization())) {
TestAll();
}

const bool display = param_.display() && iter_ % param_.display() == 0;
net_->set_debug_info(display && param_.debug_info());
- Dtype loss = net_->ForwardBackward(bottom_vec);
+ // accumulate the loss and gradient
+ Dtype loss = 0;
+ for (int i = 0; i < param_.iter_size(); ++i) {
+ loss += net_->ForwardBackward(bottom_vec);
+ }
+ loss /= param_.iter_size();
+ // average the loss across iterations for smoothed reporting
if (losses.size() < average_loss) {
losses.push_back(loss);
int size = losses.size();
@@ -454,7 +479,8 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
case Caffe::CPU:
for (int param_id = 0; param_id < net_params.size(); ++param_id) {
// Compute the value to history, and then copy them to the blob's diff.
- Dtype local_rate = rate * net_params_lr[param_id];
+ Dtype local_rate = rate * net_params_lr[param_id]
+ / this->param_.iter_size();
Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

if (local_decay) {
@@ -490,7 +516,8 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
#ifndef CPU_ONLY
for (int param_id = 0; param_id < net_params.size(); ++param_id) {
// Compute the value to history, and then copy them to the blob's diff.
- Dtype local_rate = rate * net_params_lr[param_id];
+ Dtype local_rate = rate * net_params_lr[param_id]
+ / this->param_.iter_size();
Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

if (local_decay) {
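Because the accumulated diff holds the sum of iter_size gradients rather than their mean, the update scales the local learning rate down by iter_size: (rate * lr_mult / iter_size) * (g_1 + ... + g_k) equals rate * lr_mult times the average gradient, matching the gradient term of a single large-batch update. With the default iter_size of 1 the computation is unchanged.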