Adadelta #1122

Closed
wants to merge 3 commits into from
22 changes: 22 additions & 0 deletions examples/mnist/lenet_adadelta_solver.prototxt
@@ -0,0 +1,22 @@
# The train/test net protocol buffer definition
net: "examples/mnist/lenet_train_test.prototxt"
# test_iter specifies how many forward passes the test should carry out.
# In the case of MNIST, we have test batch size 100 and 100 test iterations,
# covering the full 10,000 testing images.
test_iter: 100
# Carry out testing every 500 training iterations.
test_interval: 500
# The momentum and the weight decay of the network (AdaDelta uses no base learning rate).
momentum: 0.95
weight_decay: 0.0005
# Display every 100 iterations
display: 100
# The maximum number of iterations
max_iter: 10000
# snapshot intermediate results
snapshot: 5000
snapshot_prefix: "examples/mnist/lenet_adadelta"
# solver mode: CPU or GPU
solver_mode: GPU
solver_type: ADADELTA
delta: 1e-6
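
Note (not part of the diff): the PR does not add a launch script for this LeNet solver file, but assuming Caffe has been built under ./build, training could be started from the repository root with the standard caffe tool:

./build/tools/caffe train \
    --solver=examples/mnist/lenet_adadelta_solver.prototxt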
17 changes: 17 additions & 0 deletions examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
@@ -0,0 +1,17 @@
net: "examples/mnist/mnist_autoencoder.prototxt"
test_state: { stage: 'test-on-train' }
test_iter: 500
test_state: { stage: 'test-on-test' }
test_iter: 100
test_interval: 500
test_compute_loss: true
momentum: 0.95
display: 100
max_iter: 65000
weight_decay: 0.0005
snapshot: 10000
snapshot_prefix: "examples/mnist/mnist_autoencoder_adadelta_train"
# solver mode: CPU or GPU
solver_mode: GPU
solver_type: ADADELTA
delta: 1e-8
4 changes: 4 additions & 0 deletions examples/mnist/train_mnist_autoencoder_adadelta.sh
@@ -0,0 +1,4 @@
#!/bin/bash

./build/tools/caffe train \
--solver=examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
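
Assuming the repository root as the working directory and that Caffe has been built (so ./build/tools/caffe exists), the script above can be run directly:

sh examples/mnist/train_mnist_autoencoder_adadelta.sh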
23 changes: 23 additions & 0 deletions include/caffe/solver.hpp
@@ -124,6 +124,27 @@ class AdaGradSolver : public SGDSolver<Dtype> {
  DISABLE_COPY_AND_ASSIGN(AdaGradSolver);
};

template <typename Dtype>
class AdaDeltaSolver : public SGDSolver<Dtype> {
 public:
  explicit AdaDeltaSolver(const SolverParameter& param)
      : SGDSolver<Dtype>(param) { constructor_sanity_check(); }
  explicit AdaDeltaSolver(const string& param_file)
      : SGDSolver<Dtype>(param_file) { constructor_sanity_check(); }

 protected:
  virtual void PreSolve();
  virtual void ComputeUpdateValue();
  void constructor_sanity_check() {
    CHECK_EQ(0, this->param_.base_lr())
        << "Learning rate cannot be used with AdaDelta.";
    CHECK_EQ("", this->param_.lr_policy())
        << "Learning rate policy cannot be applied to AdaDelta.";
  }

  DISABLE_COPY_AND_ASSIGN(AdaDeltaSolver);
};

template <typename Dtype>
Solver<Dtype>* GetSolver(const SolverParameter& param) {
SolverParameter_SolverType type = param.solver_type();
@@ -135,6 +156,8 @@ Solver<Dtype>* GetSolver(const SolverParameter& param) {
      return new NesterovSolver<Dtype>(param);
  case SolverParameter_SolverType_ADAGRAD:
      return new AdaGradSolver<Dtype>(param);
  case SolverParameter_SolverType_ADADELTA:
      return new AdaDeltaSolver<Dtype>(param);
  default:
      LOG(FATAL) << "Unknown SolverType: " << type;
  }
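For reference, the update rule implemented by this solver is Zeiler's AdaDelta, where the solver's momentum parameter plays the role of the decay rate \rho and delta is the numerical-stability constant \epsilon (a sketch; the sign of the step is applied by the usual parameter update):

E[g^2]_t = \rho \, E[g^2]_{t-1} + (1 - \rho) \, g_t^2
\Delta x_t = - \frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}} \, g_t
E[\Delta x^2]_t = \rho \, E[\Delta x^2]_{t-1} + (1 - \rho) \, \Delta x_t^2

The ComputeUpdateValue implementation below follows these three steps for each parameter blob, keeping E[g^2] in the first half of history_ and E[\Delta x^2] in the second half.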
1 change: 1 addition & 0 deletions src/caffe/proto/caffe.proto
@@ -146,6 +146,7 @@ message SolverParameter {
    SGD = 0;
    NESTEROV = 1;
    ADAGRAD = 2;
    ADADELTA = 3;
  }
  optional SolverType solver_type = 30 [default = SGD];
  // numerical stability for AdaGrad
199 changes: 199 additions & 0 deletions src/caffe/solver.cpp
@@ -753,9 +753,208 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
}
}

template <typename Dtype>
void AdaDeltaSolver<Dtype>::PreSolve() {
  // Initialize the history
  vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
  this->history_.clear();
  this->update_.clear();
  this->temp_.clear();
  for (int i = 0; i < net_params.size(); ++i) {
    const Blob<Dtype>* net_param = net_params[i].get();
    this->history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
        net_param->num(), net_param->channels(), net_param->height(),
        net_param->width())));
    this->update_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
        net_param->num(), net_param->channels(), net_param->height(),
        net_param->width())));
    this->temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
        net_param->num(), net_param->channels(), net_param->height(),
        net_param->width())));
  }
  // AdaDelta needs a second set of history blobs: the first half of history_
  // accumulates squared gradients, the second half accumulates squared updates.
  for (int i = 0; i < net_params.size(); ++i) {
    const Blob<Dtype>* net_param = net_params[i].get();
    this->history_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
        net_param->num(), net_param->channels(), net_param->height(),
        net_param->width())));
  }
}

template <typename Dtype>
void AdaDeltaSolver<Dtype>::ComputeUpdateValue() {
  vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
  vector<float>& net_params_weight_decay = this->net_->params_weight_decay();
  Dtype delta = this->param_.delta();
  Dtype momentum = this->param_.momentum();
  Dtype weight_decay = this->param_.weight_decay();
  string regularization_type = this->param_.regularization_type();
  size_t update_history_offset = net_params.size();
  switch (Caffe::mode()) {
  case Caffe::CPU:
    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

      if (local_decay) {
        if (regularization_type == "L2") {
          // add weight decay
          caffe_axpy(net_params[param_id]->count(),
              local_decay,
              net_params[param_id]->cpu_data(),
              net_params[param_id]->mutable_cpu_diff());
        } else if (regularization_type == "L1") {
          caffe_cpu_sign(net_params[param_id]->count(),
              net_params[param_id]->cpu_data(),
              this->temp_[param_id]->mutable_cpu_data());
          caffe_axpy(net_params[param_id]->count(),
              local_decay,
              this->temp_[param_id]->cpu_data(),
              net_params[param_id]->mutable_cpu_diff());
        } else {
          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
        }
      }

      // compute square of gradient in update
      caffe_powx(net_params[param_id]->count(),
          net_params[param_id]->cpu_diff(), Dtype(2),
          this->update_[param_id]->mutable_cpu_data());

      // update history of gradients
      caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
          this->update_[param_id]->cpu_data(), momentum,
          this->history_[param_id]->mutable_cpu_data());

      // add delta to history to guard against dividing by zero later
      caffe_set(net_params[param_id]->count(), delta,
          this->temp_[param_id]->mutable_cpu_data());

      caffe_add(net_params[param_id]->count(),
          this->temp_[param_id]->cpu_data(),
          this->history_[update_history_offset + param_id]->cpu_data(),
          this->update_[param_id]->mutable_cpu_data());

      caffe_add(net_params[param_id]->count(),
          this->temp_[param_id]->cpu_data(),
          this->history_[param_id]->cpu_data(),
          this->temp_[param_id]->mutable_cpu_data());

      // divide history of updates by history of gradients
      caffe_div(net_params[param_id]->count(),
          this->update_[param_id]->cpu_data(),
          this->temp_[param_id]->cpu_data(),
          this->update_[param_id]->mutable_cpu_data());

      // take the square root, giving the RMS ratio of update history to
      // gradient history
      caffe_powx(net_params[param_id]->count(),
          this->update_[param_id]->cpu_data(), Dtype(0.5),
          this->update_[param_id]->mutable_cpu_data());

      // compute the update (scale the gradient by the RMS ratio, in place
      // in the diff)
      caffe_mul(net_params[param_id]->count(),
          net_params[param_id]->cpu_diff(),
          this->update_[param_id]->cpu_data(),
          net_params[param_id]->mutable_cpu_diff());

      // compute square of update
      caffe_powx(net_params[param_id]->count(),
          net_params[param_id]->cpu_diff(), Dtype(2),
          this->update_[param_id]->mutable_cpu_data());

      // update history of updates
      caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
          this->update_[param_id]->cpu_data(), momentum,
          this->history_[update_history_offset + param_id]->mutable_cpu_data());
    }
    break;
  case Caffe::GPU:
#ifndef CPU_ONLY
    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

      if (local_decay) {
        if (regularization_type == "L2") {
          // add weight decay
          caffe_gpu_axpy(net_params[param_id]->count(),
              local_decay,
              net_params[param_id]->gpu_data(),
              net_params[param_id]->mutable_gpu_diff());
        } else if (regularization_type == "L1") {
          caffe_gpu_sign(net_params[param_id]->count(),
              net_params[param_id]->gpu_data(),
              this->temp_[param_id]->mutable_gpu_data());
          caffe_gpu_axpy(net_params[param_id]->count(),
              local_decay,
              this->temp_[param_id]->gpu_data(),
              net_params[param_id]->mutable_gpu_diff());
        } else {
          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
        }
      }

      // compute square of gradient in update
      caffe_gpu_powx(net_params[param_id]->count(),
          net_params[param_id]->gpu_diff(), Dtype(2),
          this->update_[param_id]->mutable_gpu_data());

      // update history of gradients
      caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
          this->update_[param_id]->gpu_data(), momentum,
          this->history_[param_id]->mutable_gpu_data());

      // add delta to history to guard against dividing by zero later
      caffe_gpu_set(net_params[param_id]->count(), delta,
          this->temp_[param_id]->mutable_gpu_data());

      caffe_gpu_add(net_params[param_id]->count(),
          this->temp_[param_id]->gpu_data(),
          this->history_[update_history_offset + param_id]->gpu_data(),
          this->update_[param_id]->mutable_gpu_data());

      caffe_gpu_add(net_params[param_id]->count(),
          this->temp_[param_id]->gpu_data(),
          this->history_[param_id]->gpu_data(),
          this->temp_[param_id]->mutable_gpu_data());

      // divide history of updates by history of gradients
      caffe_gpu_div(net_params[param_id]->count(),
          this->update_[param_id]->gpu_data(),
          this->temp_[param_id]->gpu_data(),
          this->update_[param_id]->mutable_gpu_data());

      // take the square root, giving the RMS ratio of update history to
      // gradient history
      caffe_gpu_powx(net_params[param_id]->count(),
          this->update_[param_id]->gpu_data(), Dtype(0.5),
          this->update_[param_id]->mutable_gpu_data());

      // compute the update (scale the gradient by the RMS ratio, in place
      // in the diff)
      caffe_gpu_mul(net_params[param_id]->count(),
          net_params[param_id]->gpu_diff(),
          this->update_[param_id]->gpu_data(),
          net_params[param_id]->mutable_gpu_diff());

      // compute square of update
      caffe_gpu_powx(net_params[param_id]->count(),
          net_params[param_id]->gpu_diff(), Dtype(2),
          this->update_[param_id]->mutable_gpu_data());

      // update history of updates
      caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
          this->update_[param_id]->gpu_data(), momentum,
          this->history_[update_history_offset + param_id]->mutable_gpu_data());
    }
#else
    NO_GPU;
#endif
    break;
  default:
    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
  }
}

INSTANTIATE_CLASS(Solver);
INSTANTIATE_CLASS(SGDSolver);
INSTANTIATE_CLASS(NesterovSolver);
INSTANTIATE_CLASS(AdaGradSolver);
INSTANTIATE_CLASS(AdaDeltaSolver);

} // namespace caffe