#include #include #include #include #include "hdf5.h" #include "hdf5_hl.h" #include "caffe/net.hpp" #include "caffe/proto/caffe.pb.h" #include "caffe/solver.hpp" #include "caffe/util/hdf5.hpp" #include "caffe/util/io.hpp" #include "caffe/util/math_functions.hpp" #include "caffe/util/upgrade_proto.hpp" namespace caffe { template void Solver::SetActionFunction(ActionCallback func) { action_request_function_ = func; } template SolverAction::Enum Solver::GetRequestedAction() { if (action_request_function_) { // If the external request function has been set, call it. return action_request_function_(); } return SolverAction::NONE; } template Solver::Solver(const SolverParameter& param, const Solver* root_solver) : net_(), callbacks_(), root_solver_(root_solver), requested_early_exit_(false) { Init(param); } template Solver::Solver(const string& param_file, const Solver* root_solver) : net_(), callbacks_(), root_solver_(root_solver), requested_early_exit_(false) { SolverParameter param; ReadProtoFromTextFileOrDie(param_file, ¶m); Init(param); } template void Solver::Init(const SolverParameter& param) { CHECK(Caffe::root_solver() || root_solver_) << "root_solver_ needs to be set for all non-root solvers"; LOG_IF(INFO, Caffe::root_solver()) << "Initializing solver from parameters: " << std::endl << param.DebugString(); param_ = param; CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative."; if (Caffe::root_solver() && param_.random_seed() >= 0) { Caffe::set_random_seed(param_.random_seed()); } // Scaffolding code InitTrainNet(); if (Caffe::root_solver()) { InitTestNets(); LOG(INFO) << "Solver scaffolding done."; } iter_ = 0; current_step_ = 0; } template void Solver::InitTrainNet() { const int num_train_nets = param_.has_net() + param_.has_net_param() + param_.has_train_net() + param_.has_train_net_param(); const string& field_names = "net, net_param, train_net, train_net_param"; CHECK_GE(num_train_nets, 1) << "SolverParameter must specify a train net " << "using one of these fields: " << field_names; CHECK_LE(num_train_nets, 1) << "SolverParameter must not contain more than " << "one of these fields specifying a train_net: " << field_names; NetParameter net_param; if (param_.has_train_net_param()) { LOG_IF(INFO, Caffe::root_solver()) << "Creating training net specified in train_net_param."; net_param.CopyFrom(param_.train_net_param()); } else if (param_.has_train_net()) { LOG_IF(INFO, Caffe::root_solver()) << "Creating training net from train_net file: " << param_.train_net(); ReadNetParamsFromTextFileOrDie(param_.train_net(), &net_param); } if (param_.has_net_param()) { LOG_IF(INFO, Caffe::root_solver()) << "Creating training net specified in net_param."; net_param.CopyFrom(param_.net_param()); } if (param_.has_net()) { LOG_IF(INFO, Caffe::root_solver()) << "Creating training net from net file: " << param_.net(); ReadNetParamsFromTextFileOrDie(param_.net(), &net_param); } // Set the correct NetState. We start with the solver defaults (lowest // precedence); then, merge in any NetState specified by the net_param itself; // finally, merge in any NetState specified by the train_state (highest // precedence). NetState net_state; net_state.set_phase(TRAIN); net_state.MergeFrom(net_param.state()); net_state.MergeFrom(param_.train_state()); net_param.mutable_state()->CopyFrom(net_state); if (Caffe::root_solver()) { net_.reset(new Net(net_param)); } else { net_.reset(new Net(net_param, root_solver_->net_.get())); } } template void Solver::InitTestNets() { CHECK(Caffe::root_solver()); const bool has_net_param = param_.has_net_param(); const bool has_net_file = param_.has_net(); const int num_generic_nets = has_net_param + has_net_file; CHECK_LE(num_generic_nets, 1) << "Both net_param and net_file may not be specified."; const int num_test_net_params = param_.test_net_param_size(); const int num_test_net_files = param_.test_net_size(); const int num_test_nets = num_test_net_params + num_test_net_files; if (num_generic_nets) { CHECK_GE(param_.test_iter_size(), num_test_nets) << "test_iter must be specified for each test network."; } else { CHECK_EQ(param_.test_iter_size(), num_test_nets) << "test_iter must be specified for each test network."; } // If we have a generic net (specified by net or net_param, rather than // test_net or test_net_param), we may have an unlimited number of actual // test networks -- the actual number is given by the number of remaining // test_iters after any test nets specified by test_net_param and/or test_net // are evaluated. const int num_generic_net_instances = param_.test_iter_size() - num_test_nets; const int num_test_net_instances = num_test_nets + num_generic_net_instances; if (param_.test_state_size()) { CHECK_EQ(param_.test_state_size(), num_test_net_instances) << "test_state must be unspecified or specified once per test net."; } if (num_test_net_instances) { CHECK_GT(param_.test_interval(), 0); } int test_net_id = 0; vector sources(num_test_net_instances); vector net_params(num_test_net_instances); for (int i = 0; i < num_test_net_params; ++i, ++test_net_id) { sources[test_net_id] = "test_net_param"; net_params[test_net_id].CopyFrom(param_.test_net_param(i)); } for (int i = 0; i < num_test_net_files; ++i, ++test_net_id) { sources[test_net_id] = "test_net file: " + param_.test_net(i); ReadNetParamsFromTextFileOrDie(param_.test_net(i), &net_params[test_net_id]); } const int remaining_test_nets = param_.test_iter_size() - test_net_id; if (has_net_param) { for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { sources[test_net_id] = "net_param"; net_params[test_net_id].CopyFrom(param_.net_param()); } } if (has_net_file) { for (int i = 0; i < remaining_test_nets; ++i, ++test_net_id) { sources[test_net_id] = "net file: " + param_.net(); ReadNetParamsFromTextFileOrDie(param_.net(), &net_params[test_net_id]); } } test_nets_.resize(num_test_net_instances); for (int i = 0; i < num_test_net_instances; ++i) { // Set the correct NetState. We start with the solver defaults (lowest // precedence); then, merge in any NetState specified by the net_param // itself; finally, merge in any NetState specified by the test_state // (highest precedence). NetState net_state; net_state.set_phase(TEST); net_state.MergeFrom(net_params[i].state()); if (param_.test_state_size()) { net_state.MergeFrom(param_.test_state(i)); } net_params[i].mutable_state()->CopyFrom(net_state); LOG(INFO) << "Creating test net (#" << i << ") specified by " << sources[i]; if (Caffe::root_solver()) { test_nets_[i].reset(new Net(net_params[i])); } else { test_nets_[i].reset(new Net(net_params[i], root_solver_->test_nets_[i].get())); } test_nets_[i]->set_debug_info(param_.debug_info()); } } template void Solver::Step(int iters) { vector*> bottom_vec; const int start_iter = iter_; const int stop_iter = iter_ + iters; int average_loss = this->param_.average_loss(); vector losses; Dtype smoothed_loss = 0; while (iter_ < stop_iter) { // zero-init the params net_->ClearParamDiffs(); if (param_.test_interval() && iter_ % param_.test_interval() == 0 && (iter_ > 0 || param_.test_initialization()) && Caffe::root_solver()) { TestAll(); if (requested_early_exit_) { // Break out of the while loop because stop was requested while testing. break; } } for (int i = 0; i < callbacks_.size(); ++i) { callbacks_[i]->on_start(); } const bool display = param_.display() && iter_ % param_.display() == 0; net_->set_debug_info(display && param_.debug_info()); // accumulate the loss and gradient Dtype loss = 0; for (int i = 0; i < param_.iter_size(); ++i) { loss += net_->ForwardBackward(bottom_vec); } loss /= param_.iter_size(); // average the loss across iterations for smoothed reporting if (losses.size() < average_loss) { losses.push_back(loss); int size = losses.size(); smoothed_loss = (smoothed_loss * (size - 1) + loss) / size; } else { int idx = (iter_ - start_iter) % average_loss; smoothed_loss += (loss - losses[idx]) / average_loss; losses[idx] = loss; } if (display) { LOG_IF(INFO, Caffe::root_solver()) << "Iteration " << iter_ << ", loss = " << smoothed_loss; const vector*>& result = net_->output_blobs(); int score_index = 0; for (int j = 0; j < result.size(); ++j) { const Dtype* result_vec = result[j]->cpu_data(); const string& output_name = net_->blob_names()[net_->output_blob_indices()[j]]; const Dtype loss_weight = net_->blob_loss_weights()[net_->output_blob_indices()[j]]; for (int k = 0; k < result[j]->count(); ++k) { ostringstream loss_msg_stream; if (loss_weight) { loss_msg_stream << " (* " << loss_weight << " = " << loss_weight * result_vec[k] << " loss)"; } LOG_IF(INFO, Caffe::root_solver()) << " Train net output #" << score_index++ << ": " << output_name << " = " << result_vec[k] << loss_msg_stream.str(); } } } for (int i = 0; i < callbacks_.size(); ++i) { callbacks_[i]->on_gradients_ready(); } ApplyUpdate(); // Increment the internal iter_ counter -- its value should always indicate // the number of times the weights have been updated. ++iter_; SolverAction::Enum request = GetRequestedAction(); // Save a snapshot if needed. if ((param_.snapshot() && iter_ % param_.snapshot() == 0 && Caffe::root_solver()) || (request == SolverAction::SNAPSHOT)) { Snapshot(); } if (SolverAction::STOP == request) { requested_early_exit_ = true; // Break out of training loop. break; } } } template void Solver::Solve(const char* resume_file) { CHECK(Caffe::root_solver()); LOG(INFO) << "Solving " << net_->name(); LOG(INFO) << "Learning Rate Policy: " << param_.lr_policy(); // Initialize to false every time we start solving. requested_early_exit_ = false; if (resume_file) { LOG(INFO) << "Restoring previous solver status from " << resume_file; Restore(resume_file); } // For a network that is trained by the solver, no bottom or top vecs // should be given, and we will just provide dummy vecs. Step(param_.max_iter() - iter_); // If we haven't already, save a snapshot after optimization, unless // overridden by setting snapshot_after_train := false if (param_.snapshot_after_train() && (!param_.snapshot() || iter_ % param_.snapshot() != 0)) { Snapshot(); } if (requested_early_exit_) { LOG(INFO) << "Optimization stopped early."; return; } // After the optimization is done, run an additional train and test pass to // display the train and test loss/outputs if appropriate (based on the // display and test_interval settings, respectively). Unlike in the rest of // training, for the train net we only run a forward pass as we've already // updated the parameters "max_iter" times -- this final pass is only done to // display the loss, which is computed in the forward pass. if (param_.display() && iter_ % param_.display() == 0) { Dtype loss; net_->ForwardPrefilled(&loss); LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss; } if (param_.test_interval() && iter_ % param_.test_interval() == 0) { TestAll(); } LOG(INFO) << "Optimization Done."; } template void Solver::TestAll() { for (int test_net_id = 0; test_net_id < test_nets_.size() && !requested_early_exit_; ++test_net_id) { Test(test_net_id); } } template void Solver::Test(const int test_net_id) { CHECK(Caffe::root_solver()); LOG(INFO) << "Iteration " << iter_ << ", Testing net (#" << test_net_id << ")"; CHECK_NOTNULL(test_nets_[test_net_id].get())-> ShareTrainedLayersWith(net_.get()); vector test_score; vector test_score_output_id; vector*> bottom_vec; const shared_ptr >& test_net = test_nets_[test_net_id]; Dtype loss = 0; for (int i = 0; i < param_.test_iter(test_net_id); ++i) { SolverAction::Enum request = GetRequestedAction(); // Check to see if stoppage of testing/training has been requested. while (request != SolverAction::NONE) { if (SolverAction::SNAPSHOT == request) { Snapshot(); } else if (SolverAction::STOP == request) { requested_early_exit_ = true; } request = GetRequestedAction(); } if (requested_early_exit_) { // break out of test loop. break; } Dtype iter_loss; const vector*>& result = test_net->Forward(bottom_vec, &iter_loss); if (param_.test_compute_loss()) { loss += iter_loss; } if (i == 0) { for (int j = 0; j < result.size(); ++j) { const Dtype* result_vec = result[j]->cpu_data(); for (int k = 0; k < result[j]->count(); ++k) { test_score.push_back(result_vec[k]); test_score_output_id.push_back(j); } } } else { int idx = 0; for (int j = 0; j < result.size(); ++j) { const Dtype* result_vec = result[j]->cpu_data(); for (int k = 0; k < result[j]->count(); ++k) { test_score[idx++] += result_vec[k]; } } } } if (requested_early_exit_) { LOG(INFO) << "Test interrupted."; return; } if (param_.test_compute_loss()) { loss /= param_.test_iter(test_net_id); LOG(INFO) << "Test loss: " << loss; } for (int i = 0; i < test_score.size(); ++i) { const int output_blob_index = test_net->output_blob_indices()[test_score_output_id[i]]; const string& output_name = test_net->blob_names()[output_blob_index]; const Dtype loss_weight = test_net->blob_loss_weights()[output_blob_index]; ostringstream loss_msg_stream; const Dtype mean_score = test_score[i] / param_.test_iter(test_net_id); if (loss_weight) { loss_msg_stream << " (* " << loss_weight << " = " << loss_weight * mean_score << " loss)"; } LOG(INFO) << " Test net output #" << i << ": " << output_name << " = " << mean_score << loss_msg_stream.str(); } } template void Solver::Snapshot() { CHECK(Caffe::root_solver()); string model_filename; switch (param_.snapshot_format()) { case caffe::SolverParameter_SnapshotFormat_BINARYPROTO: model_filename = SnapshotToBinaryProto(); break; case caffe::SolverParameter_SnapshotFormat_HDF5: model_filename = SnapshotToHDF5(); break; default: LOG(FATAL) << "Unsupported snapshot format."; } SnapshotSolverState(model_filename); } template string Solver::SnapshotFilename(const string extension) { string filename(param_.snapshot_prefix()); const int kBufferSize = 20; char iter_str_buffer[kBufferSize]; snprintf(iter_str_buffer, kBufferSize, "_iter_%d", iter_); return filename + iter_str_buffer + extension; } template string Solver::SnapshotToBinaryProto() { string model_filename = SnapshotFilename(".caffemodel"); LOG(INFO) << "Snapshotting to binary proto file " << model_filename; NetParameter net_param; net_->ToProto(&net_param, param_.snapshot_diff()); WriteProtoToBinaryFile(net_param, model_filename); return model_filename; } template string Solver::SnapshotToHDF5() { string model_filename = SnapshotFilename(".caffemodel.h5"); LOG(INFO) << "Snapshotting to HDF5 file " << model_filename; net_->ToHDF5(model_filename, param_.snapshot_diff()); return model_filename; } template void Solver::Restore(const char* state_file) { CHECK(Caffe::root_solver()); string state_filename(state_file); if (state_filename.size() >= 3 && state_filename.compare(state_filename.size() - 3, 3, ".h5") == 0) { RestoreSolverStateFromHDF5(state_filename); } else { RestoreSolverStateFromBinaryProto(state_filename); } } // Return the current learning rate. The currently implemented learning rate // policies are as follows: // - fixed: always return base_lr. // - step: return base_lr * gamma ^ (floor(iter / step)) // - exp: return base_lr * gamma ^ iter // - inv: return base_lr * (1 + gamma * iter) ^ (- power) // - multistep: similar to step but it allows non uniform steps defined by // stepvalue // - poly: the effective learning rate follows a polynomial decay, to be // zero by the max_iter. return base_lr (1 - iter/max_iter) ^ (power) // - sigmoid: the effective learning rate follows a sigmod decay // return base_lr ( 1/(1 + exp(-gamma * (iter - stepsize)))) // // where base_lr, max_iter, gamma, step, stepvalue and power are defined // in the solver parameter protocol buffer, and iter is the current iteration. template Dtype SGDSolver::GetLearningRate() { Dtype rate; const string& lr_policy = this->param_.lr_policy(); if (lr_policy == "fixed") { rate = this->param_.base_lr(); } else if (lr_policy == "step") { this->current_step_ = this->iter_ / this->param_.stepsize(); rate = this->param_.base_lr() * pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "exp") { rate = this->param_.base_lr() * pow(this->param_.gamma(), this->iter_); } else if (lr_policy == "inv") { rate = this->param_.base_lr() * pow(Dtype(1) + this->param_.gamma() * this->iter_, - this->param_.power()); } else if (lr_policy == "multistep") { if (this->current_step_ < this->param_.stepvalue_size() && this->iter_ >= this->param_.stepvalue(this->current_step_)) { this->current_step_++; LOG(INFO) << "MultiStep Status: Iteration " << this->iter_ << ", step = " << this->current_step_; } rate = this->param_.base_lr() * pow(this->param_.gamma(), this->current_step_); } else if (lr_policy == "poly") { rate = this->param_.base_lr() * pow(Dtype(1.) - (Dtype(this->iter_) / Dtype(this->param_.max_iter())), this->param_.power()); } else if (lr_policy == "sigmoid") { rate = this->param_.base_lr() * (Dtype(1.) / (Dtype(1.) + exp(-this->param_.gamma() * (Dtype(this->iter_) - Dtype(this->param_.stepsize()))))); } else { LOG(FATAL) << "Unknown learning rate policy: " << lr_policy; } return rate; } template void SGDSolver::PreSolve() { // Initialize the history const vector*>& net_params = this->net_->learnable_params(); history_.clear(); update_.clear(); temp_.clear(); for (int i = 0; i < net_params.size(); ++i) { const vector& shape = net_params[i]->shape(); history_.push_back(shared_ptr >(new Blob(shape))); update_.push_back(shared_ptr >(new Blob(shape))); temp_.push_back(shared_ptr >(new Blob(shape))); } } template void SGDSolver::ClipGradients() { const Dtype clip_gradients = this->param_.clip_gradients(); if (clip_gradients < 0) { return; } const vector*>& net_params = this->net_->learnable_params(); Dtype sumsq_diff = 0; for (int i = 0; i < net_params.size(); ++i) { sumsq_diff += net_params[i]->sumsq_diff(); } const Dtype l2norm_diff = std::sqrt(sumsq_diff); if (l2norm_diff > clip_gradients) { Dtype scale_factor = clip_gradients / l2norm_diff; LOG(INFO) << "Gradient clipping: scaling down gradients (L2 norm " << l2norm_diff << " > " << clip_gradients << ") " << "by scale factor " << scale_factor; for (int i = 0; i < net_params.size(); ++i) { net_params[i]->scale_diff(scale_factor); } } } template void SGDSolver::ApplyUpdate() { CHECK(Caffe::root_solver()); Dtype rate = GetLearningRate(); if (this->param_.display() && this->iter_ % this->param_.display() == 0) { LOG(INFO) << "Iteration " << this->iter_ << ", lr = " << rate; } ClipGradients(); for (int param_id = 0; param_id < this->net_->learnable_params().size(); ++param_id) { Normalize(param_id); Regularize(param_id); ComputeUpdateValue(param_id, rate); } this->net_->Update(); } template void SGDSolver::Normalize(int param_id) { if (this->param_.iter_size() == 1) { return; } // Scale gradient to counterbalance accumulation. const vector*>& net_params = this->net_->learnable_params(); const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size(); switch (Caffe::mode()) { case Caffe::CPU: { caffe_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY caffe_gpu_scal(net_params[param_id]->count(), accum_normalization, net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif break; } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } template void SGDSolver::Regularize(int param_id) { const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_weight_decay = this->net_->params_weight_decay(); Dtype weight_decay = this->param_.weight_decay(); string regularization_type = this->param_.regularization_type(); Dtype local_decay = weight_decay * net_params_weight_decay[param_id]; switch (Caffe::mode()) { case Caffe::CPU: { if (local_decay) { if (regularization_type == "L2") { // add weight decay caffe_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); } else if (regularization_type == "L1") { caffe_cpu_sign(net_params[param_id]->count(), net_params[param_id]->cpu_data(), temp_[param_id]->mutable_cpu_data()); caffe_axpy(net_params[param_id]->count(), local_decay, temp_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } } break; } case Caffe::GPU: { #ifndef CPU_ONLY if (local_decay) { if (regularization_type == "L2") { // add weight decay caffe_gpu_axpy(net_params[param_id]->count(), local_decay, net_params[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); } else if (regularization_type == "L1") { caffe_gpu_sign(net_params[param_id]->count(), net_params[param_id]->gpu_data(), temp_[param_id]->mutable_gpu_data()); caffe_gpu_axpy(net_params[param_id]->count(), local_decay, temp_[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); } else { LOG(FATAL) << "Unknown regularization type: " << regularization_type; } } #else NO_GPU; #endif break; } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } template void SGDSolver::ComputeUpdateValue(int param_id, Dtype rate) { const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype momentum = this->param_.momentum(); Dtype local_rate = rate * net_params_lr[param_id]; // Compute the update to history, then copy it to the parameter diff. switch (Caffe::mode()) { case Caffe::CPU: { caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, history_[param_id]->mutable_cpu_data()); caffe_copy(net_params[param_id]->count(), history_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY caffe_gpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->gpu_diff(), momentum, history_[param_id]->mutable_gpu_data()); caffe_copy(net_params[param_id]->count(), history_[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif break; } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } template void SGDSolver::SnapshotSolverState(const string& model_filename) { switch (this->param_.snapshot_format()) { case caffe::SolverParameter_SnapshotFormat_BINARYPROTO: SnapshotSolverStateToBinaryProto(model_filename); break; case caffe::SolverParameter_SnapshotFormat_HDF5: SnapshotSolverStateToHDF5(model_filename); break; default: LOG(FATAL) << "Unsupported snapshot format."; } } template void SGDSolver::SnapshotSolverStateToBinaryProto( const string& model_filename) { SolverState state; state.set_iter(this->iter_); state.set_learned_net(model_filename); state.set_current_step(this->current_step_); state.clear_history(); for (int i = 0; i < history_.size(); ++i) { // Add history BlobProto* history_blob = state.add_history(); history_[i]->ToProto(history_blob); } string snapshot_filename = Solver::SnapshotFilename(".solverstate"); LOG(INFO) << "Snapshotting solver state to binary proto file" << snapshot_filename; WriteProtoToBinaryFile(state, snapshot_filename.c_str()); } template void SGDSolver::SnapshotSolverStateToHDF5( const string& model_filename) { string snapshot_filename = Solver::SnapshotFilename(".solverstate.h5"); LOG(INFO) << "Snapshotting solver state to HDF5 file " << snapshot_filename; hid_t file_hid = H5Fcreate(snapshot_filename.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); CHECK_GE(file_hid, 0) << "Couldn't open " << snapshot_filename << " to save solver state."; hdf5_save_int(file_hid, "iter", this->iter_); hdf5_save_string(file_hid, "learned_net", model_filename); hdf5_save_int(file_hid, "current_step", this->current_step_); hid_t history_hid = H5Gcreate2(file_hid, "history", H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT); CHECK_GE(history_hid, 0) << "Error saving solver state to " << snapshot_filename << "."; for (int i = 0; i < history_.size(); ++i) { ostringstream oss; oss << i; hdf5_save_nd_dataset(history_hid, oss.str(), *history_[i]); } H5Gclose(history_hid); H5Fclose(file_hid); } template void SGDSolver::RestoreSolverStateFromBinaryProto( const string& state_file) { SolverState state; ReadProtoFromBinaryFile(state_file, &state); this->iter_ = state.iter(); if (state.has_learned_net()) { NetParameter net_param; ReadNetParamsFromBinaryFileOrDie(state.learned_net().c_str(), &net_param); this->net_->CopyTrainedLayersFrom(net_param); } this->current_step_ = state.current_step(); CHECK_EQ(state.history_size(), history_.size()) << "Incorrect length of history blobs."; LOG(INFO) << "SGDSolver: restoring history"; for (int i = 0; i < history_.size(); ++i) { history_[i]->FromProto(state.history(i)); } } template void SGDSolver::RestoreSolverStateFromHDF5(const string& state_file) { hid_t file_hid = H5Fopen(state_file.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); CHECK_GE(file_hid, 0) << "Couldn't open solver state file " << state_file; this->iter_ = hdf5_load_int(file_hid, "iter"); if (H5LTfind_dataset(file_hid, "learned_net")) { string learned_net = hdf5_load_string(file_hid, "learned_net"); this->net_->CopyTrainedLayersFrom(learned_net); } this->current_step_ = hdf5_load_int(file_hid, "current_step"); hid_t history_hid = H5Gopen2(file_hid, "history", H5P_DEFAULT); CHECK_GE(history_hid, 0) << "Error reading history from " << state_file; int state_history_size = hdf5_get_num_links(history_hid); CHECK_EQ(state_history_size, history_.size()) << "Incorrect length of history blobs."; for (int i = 0; i < history_.size(); ++i) { ostringstream oss; oss << i; hdf5_load_nd_dataset(history_hid, oss.str().c_str(), 0, kMaxBlobAxes, history_[i].get()); } H5Gclose(history_hid); H5Fclose(file_hid); } template void NesterovSolver::ComputeUpdateValue(int param_id, Dtype rate) { CHECK(Caffe::root_solver()); const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype momentum = this->param_.momentum(); Dtype local_rate = rate * net_params_lr[param_id]; switch (Caffe::mode()) { case Caffe::CPU: { // save history momentum for stepping back caffe_copy(net_params[param_id]->count(), this->history_[param_id]->cpu_data(), this->update_[param_id]->mutable_cpu_data()); // update history caffe_cpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), momentum, this->history_[param_id]->mutable_cpu_data()); // compute update: step back then over step caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, this->history_[param_id]->cpu_data(), -momentum, this->update_[param_id]->mutable_cpu_data()); // copy caffe_copy(net_params[param_id]->count(), this->update_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY // save history momentum for stepping back caffe_copy(net_params[param_id]->count(), this->history_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); // update history caffe_gpu_axpby(net_params[param_id]->count(), local_rate, net_params[param_id]->gpu_diff(), momentum, this->history_[param_id]->mutable_gpu_data()); // compute update: step back then over step caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) + momentum, this->history_[param_id]->gpu_data(), -momentum, this->update_[param_id]->mutable_gpu_data()); // copy caffe_copy(net_params[param_id]->count(), this->update_[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif break; } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } template void AdaGradSolver::ComputeUpdateValue(int param_id, Dtype rate) { CHECK(Caffe::root_solver()); const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype delta = this->param_.delta(); Dtype local_rate = rate * net_params_lr[param_id]; switch (Caffe::mode()) { case Caffe::CPU: { // compute square of gradient in update caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), Dtype(2), this->update_[param_id]->mutable_cpu_data()); // update history caffe_add(net_params[param_id]->count(), this->update_[param_id]->cpu_data(), this->history_[param_id]->cpu_data(), this->history_[param_id]->mutable_cpu_data()); // prepare update caffe_powx(net_params[param_id]->count(), this->history_[param_id]->cpu_data(), Dtype(0.5), this->update_[param_id]->mutable_cpu_data()); caffe_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_cpu_data()); caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(), this->update_[param_id]->mutable_cpu_data()); // scale and copy caffe_cpu_axpby(net_params[param_id]->count(), local_rate, this->update_[param_id]->cpu_data(), Dtype(0), net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY // compute square of gradient in update caffe_gpu_powx(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), Dtype(2), this->update_[param_id]->mutable_gpu_data()); // update history caffe_gpu_add(net_params[param_id]->count(), this->update_[param_id]->gpu_data(), this->history_[param_id]->gpu_data(), this->history_[param_id]->mutable_gpu_data()); // prepare update caffe_gpu_powx(net_params[param_id]->count(), this->history_[param_id]->gpu_data(), Dtype(0.5), this->update_[param_id]->mutable_gpu_data()); caffe_gpu_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data()); caffe_gpu_div(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); // scale and copy caffe_gpu_axpby(net_params[param_id]->count(), local_rate, this->update_[param_id]->gpu_data(), Dtype(0), net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif break; } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } template void RMSPropSolver::ComputeUpdateValue(int param_id, Dtype rate) { const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); // get the learning rate Dtype delta = this->param_.delta(); Dtype rms_decay = this->param_.rms_decay(); Dtype local_rate = rate * net_params_lr[param_id]; switch (Caffe::mode()) { case Caffe::CPU: // compute square of gradient in update caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), Dtype(2), this->update_[param_id]->mutable_cpu_data()); // update history caffe_cpu_axpby(net_params[param_id] -> count(), Dtype(1-rms_decay), this->update_[param_id]->cpu_data(), rms_decay, this->history_[param_id]-> mutable_cpu_data()); // prepare update caffe_powx(net_params[param_id]->count(), this->history_[param_id]->cpu_data(), Dtype(0.5), this->update_[param_id]->mutable_cpu_data()); caffe_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_cpu_data()); caffe_div(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(), this->update_[param_id]->mutable_cpu_data()); // scale and copy caffe_cpu_axpby(net_params[param_id]->count(), local_rate, this->update_[param_id]->cpu_data(), Dtype(0), net_params[param_id]->mutable_cpu_diff()); break; case Caffe::GPU: #ifndef CPU_ONLY // compute square of gradient in update caffe_gpu_powx(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), Dtype(2), this->update_[param_id]->mutable_gpu_data()); // update history caffe_gpu_axpby(net_params[param_id] -> count(), Dtype(1-rms_decay), this->update_[param_id]->gpu_data(), rms_decay, this->history_[param_id]-> mutable_gpu_data()); // prepare update caffe_gpu_powx(net_params[param_id]->count(), this->history_[param_id]->gpu_data(), Dtype(0.5), this->update_[param_id]->mutable_gpu_data()); caffe_gpu_add_scalar(net_params[param_id]->count(), delta, this->update_[param_id]->mutable_gpu_data()); caffe_gpu_div(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); caffe_gpu_axpby(net_params[param_id]->count(), local_rate, this->update_[param_id]->gpu_data(), Dtype(0), net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif break; default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } template void AdaDeltaSolver::AdaDeltaPreSolve() { // Add the extra history entries for AdaDelta after those from // SGDSolver::PreSolve const vector*>& net_params = this->net_->learnable_params(); for (int i = 0; i < net_params.size(); ++i) { const vector& shape = net_params[i]->shape(); this->history_.push_back( shared_ptr >(new Blob(shape))); } } template void AdaDeltaSolver::ComputeUpdateValue(int param_id, Dtype rate) { const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype delta = this->param_.delta(); Dtype momentum = this->param_.momentum(); Dtype local_rate = rate * net_params_lr[param_id]; size_t update_history_offset = net_params.size(); switch (Caffe::mode()) { case Caffe::CPU: { // compute square of gradient in update caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), Dtype(2), this->update_[param_id]->mutable_cpu_data()); // update history of gradients caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum, this->update_[param_id]->cpu_data(), momentum, this->history_[param_id]->mutable_cpu_data()); // add delta to history to guard against dividing by zero later caffe_set(net_params[param_id]->count(), delta, this->temp_[param_id]->mutable_cpu_data()); caffe_add(net_params[param_id]->count(), this->temp_[param_id]->cpu_data(), this->history_[update_history_offset + param_id]->cpu_data(), this->update_[param_id]->mutable_cpu_data()); caffe_add(net_params[param_id]->count(), this->temp_[param_id]->cpu_data(), this->history_[param_id]->cpu_data(), this->temp_[param_id]->mutable_cpu_data()); // divide history of updates by history of gradients caffe_div(net_params[param_id]->count(), this->update_[param_id]->cpu_data(), this->temp_[param_id]->cpu_data(), this->update_[param_id]->mutable_cpu_data()); // jointly compute the RMS of both for update and gradient history caffe_powx(net_params[param_id]->count(), this->update_[param_id]->cpu_data(), Dtype(0.5), this->update_[param_id]->mutable_cpu_data()); // compute the update caffe_mul(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), this->update_[param_id]->cpu_data(), net_params[param_id]->mutable_cpu_diff()); // compute square of update caffe_powx(net_params[param_id]->count(), net_params[param_id]->cpu_diff(), Dtype(2), this->update_[param_id]->mutable_cpu_data()); // update history of updates caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum, this->update_[param_id]->cpu_data(), momentum, this->history_[update_history_offset + param_id]->mutable_cpu_data()); // apply learning rate caffe_cpu_scale(net_params[param_id]->count(), local_rate, net_params[param_id]->cpu_diff(), net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY // compute square of gradient in update caffe_gpu_powx(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), Dtype(2), this->update_[param_id]->mutable_gpu_data()); // update history of gradients caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum, this->update_[param_id]->gpu_data(), momentum, this->history_[param_id]->mutable_gpu_data()); // add delta to history to guard against dividing by zero later caffe_gpu_set(net_params[param_id]->count(), delta, this->temp_[param_id]->mutable_gpu_data()); caffe_gpu_add(net_params[param_id]->count(), this->temp_[param_id]->gpu_data(), this->history_[update_history_offset + param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); caffe_gpu_add(net_params[param_id]->count(), this->temp_[param_id]->gpu_data(), this->history_[param_id]->gpu_data(), this->temp_[param_id]->mutable_gpu_data()); // divide history of updates by history of gradients caffe_gpu_div(net_params[param_id]->count(), this->update_[param_id]->gpu_data(), this->temp_[param_id]->gpu_data(), this->update_[param_id]->mutable_gpu_data()); // jointly compute the RMS of both for update and gradient history caffe_gpu_powx(net_params[param_id]->count(), this->update_[param_id]->gpu_data(), Dtype(0.5), this->update_[param_id]->mutable_gpu_data()); // compute the update and copy to net_diff caffe_gpu_mul(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), this->update_[param_id]->gpu_data(), net_params[param_id]->mutable_gpu_diff()); // compute square of update caffe_gpu_powx(net_params[param_id]->count(), net_params[param_id]->gpu_diff(), Dtype(2), this->update_[param_id]->mutable_gpu_data()); // update history of updates caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum, this->update_[param_id]->gpu_data(), momentum, this->history_[update_history_offset + param_id]->mutable_gpu_data()); // apply learning rate caffe_gpu_scale(net_params[param_id]->count(), local_rate, net_params[param_id]->gpu_diff(), net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif break; } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } template void AdamSolver::AdamPreSolve() { // Add the extra history entries for Adam after those from // SGDSolver::PreSolve const vector*>& net_params = this->net_->learnable_params(); for (int i = 0; i < net_params.size(); ++i) { const vector& shape = net_params[i]->shape(); this->history_.push_back( shared_ptr >(new Blob(shape))); } } template void AdamSolver::ComputeUpdateValue(int param_id, Dtype rate) { const vector*>& net_params = this->net_->learnable_params(); const vector& net_params_lr = this->net_->params_lr(); Dtype local_rate = rate * net_params_lr[param_id]; const Dtype beta1 = this->param_.momentum(); const Dtype beta2 = this->param_.momentum2(); // we create aliases for convenience size_t update_history_offset = net_params.size(); Blob* val_m = this->history_[param_id].get(); Blob* val_v = this->history_[param_id + update_history_offset].get(); Blob* val_t = this->temp_[param_id].get(); const int t = this->iter_ + 1; const Dtype correction = std::sqrt(Dtype(1) - pow(beta2, t)) / (Dtype(1.) - pow(beta1, t)); const int N = net_params[param_id]->count(); const Dtype eps_hat = this->param_.delta(); switch (Caffe::mode()) { case Caffe::CPU: { // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t caffe_cpu_axpby(N, Dtype(1)-beta1, net_params[param_id]->cpu_diff(), beta1, val_m->mutable_cpu_data()); // update v <- \beta_2 m_{t-1} + (1-\beta_2)g_t^2 caffe_mul(N, net_params[param_id]->cpu_diff(), net_params[param_id]->cpu_diff(), val_t->mutable_cpu_data()); caffe_cpu_axpby(N, Dtype(1)-beta2, val_t->cpu_data(), beta2, val_v->mutable_cpu_data()); // set update caffe_powx(N, val_v->cpu_data(), Dtype(0.5), val_t->mutable_cpu_data()); caffe_add_scalar(N, eps_hat, val_t->mutable_cpu_data()); caffe_div(N, val_m->cpu_data(), val_t->cpu_data(), val_t->mutable_cpu_data()); caffe_cpu_scale(N, local_rate*correction, val_t->cpu_data(), net_params[param_id]->mutable_cpu_diff()); break; } case Caffe::GPU: { #ifndef CPU_ONLY // update m <- \beta_1 m_{t-1} + (1-\beta_1)g_t caffe_gpu_axpby(N, Dtype(1)-beta1, net_params[param_id]->gpu_diff(), beta1, val_m->mutable_gpu_data()); // update v <- \beta_2 m_{t-1} + (1-\beta_2)g_t^2 caffe_gpu_mul(N, net_params[param_id]->gpu_diff(), net_params[param_id]->gpu_diff(), val_t->mutable_gpu_data()); caffe_gpu_axpby(N, Dtype(1)-beta2, val_t->gpu_data(), beta2, val_v->mutable_gpu_data()); // set update caffe_gpu_powx(N, val_v->gpu_data(), Dtype(0.5), val_t->mutable_gpu_data()); caffe_gpu_add_scalar(N, eps_hat, val_t->mutable_gpu_data()); caffe_gpu_div(N, val_m->gpu_data(), val_t->gpu_data(), val_t->mutable_gpu_data()); caffe_gpu_scale(N, local_rate*correction, val_t->gpu_data(), net_params[param_id]->mutable_gpu_diff()); #else NO_GPU; #endif break; } default: LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode(); } } INSTANTIATE_CLASS(Solver); INSTANTIATE_CLASS(SGDSolver); INSTANTIATE_CLASS(NesterovSolver); INSTANTIATE_CLASS(AdaGradSolver); INSTANTIATE_CLASS(RMSPropSolver); INSTANTIATE_CLASS(AdaDeltaSolver); INSTANTIATE_CLASS(AdamSolver); } // namespace caffe