From 0efa64c8bc69bad501fdc71e7862fc82851d5c1b Mon Sep 17 00:00:00 2001
From: zmxdream
Date: Tue, 22 Feb 2022 11:11:50 +0800
Subject: [PATCH] [GPUPS]Config fleet optimize 2 (#39783)

* update. test=develop

* update. test=develop

* fix. test=develop

* update. test=develop

* fix. test=develop

* fix. test=develop

* fix. test=develop

* update. test=develop

* update. test=develop
---
 paddle/fluid/framework/ps_gpu_trainer.cc  | 160 ++++++++++++++++++++++
 paddle/fluid/framework/trainer.h          |   7 +
 paddle/fluid/framework/trainer_desc.proto |   3 +
 python/paddle/fluid/trainer_desc.py       |   4 +
 4 files changed, 174 insertions(+)

diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc
index 4d34ba85517e1..0705f658ff5fe 100644
--- a/paddle/fluid/framework/ps_gpu_trainer.cc
+++ b/paddle/fluid/framework/ps_gpu_trainer.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <google/protobuf/text_format.h>
 #include <cstdlib>
 #include <string>
 #include <vector>
@@ -20,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_feed_factory.h"
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
 #include "paddle/fluid/framework/trainer.h"
 #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
     (defined PADDLE_WITH_PSLIB)
@@ -44,6 +46,164 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
       dense_grad_names_[table_id][j] = table.dense_grad_name(j);
     }
   }
+  // add for hbmps optimizer config
+  auto fleet_desc_str = trainer_desc.fleet_desc();
+  google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param);
+  auto sparse_table =
+      _ps_param.server_param().downpour_server_param().downpour_table_param(0);
+  auto sparse_table_accessor = sparse_table.accessor();
+  auto sparse_table_accessor_parameter =
+      sparse_table_accessor.downpour_accessor_param();
+  auto accessor_class = sparse_table_accessor.accessor_class();
+  // gpups' sparse table optimizer config
+  // now only support single sparse table
+  // auto sparse_table = param_.sparse_table(0);
+  std::unordered_map<std::string, float> config;
+  if (accessor_class == "DownpourFeatureValueAccessor" ||
+      accessor_class == "DownpourCtrAccessor" ||
+      accessor_class == "DownpourCtrDoubleAccessor") {
+    config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
+    config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
+    config["learning_rate"] =
+        sparse_table_accessor.sparse_sgd_param().learning_rate();
+    config["initial_g2sum"] =
+        sparse_table_accessor.sparse_sgd_param().initial_g2sum();
+    config["initial_range"] =
+        sparse_table_accessor.sparse_sgd_param().initial_range();
+    if (sparse_table_accessor.sparse_sgd_param().weight_bounds_size() == 2) {
+      config["min_bound"] =
+          sparse_table_accessor.sparse_sgd_param().weight_bounds()[0];
+      config["max_bound"] =
+          sparse_table_accessor.sparse_sgd_param().weight_bounds()[1];
+    }
+    config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
+  } else if (accessor_class == "DownpourSparseValueAccessor") {
+    auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name();
+    if (optimizer_name == "naive") {
+      config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param()
+                                    .naive()
+                                    .learning_rate();
+      config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param()
+                                    .naive()
+                                    .initial_range();
+      if (sparse_table_accessor.sparse_commonsgd_param()
+              .naive()
+              .weight_bounds_size() == 2) {
+        config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
+                                  .naive()
+                                  .weight_bounds()[0];
+        config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
+                                  .naive()
+                                  .weight_bounds()[1];
+      }
+    } else if (optimizer_name == "adagrad") {
+      config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param()
+                                    .adagrad()
+                                    .learning_rate();
+      config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param()
+                                    .adagrad()
+                                    .initial_range();
+      config["initial_g2sum"] = sparse_table_accessor.sparse_commonsgd_param()
+                                    .adagrad()
+                                    .initial_g2sum();
+      if (sparse_table_accessor.sparse_commonsgd_param()
+              .adagrad()
+              .weight_bounds_size() == 2) {
+        config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
+                                  .adagrad()
+                                  .weight_bounds()[0];
+        config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
+                                  .adagrad()
+                                  .weight_bounds()[1];
+      }
+    } else if (optimizer_name == "adam") {
+      config["learning_rate"] =
+          sparse_table_accessor.sparse_commonsgd_param().adam().learning_rate();
+      config["initial_range"] =
+          sparse_table_accessor.sparse_commonsgd_param().adam().initial_range();
+      if (sparse_table_accessor.sparse_commonsgd_param()
+              .adam()
+              .weight_bounds_size() == 2) {
+        config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param()
+                                  .adam()
+                                  .weight_bounds()[0];
+        config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param()
+                                  .adam()
+                                  .weight_bounds()[1];
+      }
+    }
+  } else if (accessor_class == "DownpourUnitAccessor" ||
+             accessor_class == "DownpourDoubleUnitAccessor") {
+    config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
+    config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
+    auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name();
+    if (optimizer_name == "naive") {
+      config["mf_learning_rate"] =
+          sparse_table_accessor.embedx_sgd_param().naive().learning_rate();
+      config["mf_initial_range"] =
+          sparse_table_accessor.embedx_sgd_param().naive().initial_range();
+      if (sparse_table_accessor.embedx_sgd_param()
+              .naive()
+              .weight_bounds_size() == 2) {
+        config["mf_min_bound"] =
+            sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0];
+        config["mf_max_bound"] =
+            sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1];
+      }
+    } else if (optimizer_name == "adagrad") {
+      config["mf_learning_rate"] =
+          sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate();
+      config["mf_initial_range"] =
+          sparse_table_accessor.embedx_sgd_param().adagrad().initial_range();
+      config["mf_initial_g2sum"] =
+          sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum();
+      if (sparse_table_accessor.embedx_sgd_param()
+              .adagrad()
+              .weight_bounds_size() == 2) {
+        config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param()
+                                     .adagrad()
+                                     .weight_bounds()[0];
+        config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param()
+                                     .adagrad()
+                                     .weight_bounds()[1];
+      }
+    } else if (optimizer_name == "std_adagrad") {
+      config["mf_learning_rate"] =
+          sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate();
+      config["mf_initial_range"] =
+          sparse_table_accessor.embedx_sgd_param().adagrad().initial_range();
+      config["mf_initial_g2sum"] =
+          sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum();
+      if (sparse_table_accessor.embedx_sgd_param()
+              .adagrad()
+              .weight_bounds_size() == 2) {
+        config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param()
+                                     .adagrad()
+                                     .weight_bounds()[0];
+        config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param()
+                                     .adagrad()
+                                     .weight_bounds()[1];
+      }
+    } else if (optimizer_name == "adam") {
+      config["mf_learning_rate"] =
+          sparse_table_accessor.embedx_sgd_param().adam().learning_rate();
+      config["mf_initial_range"] =
+          sparse_table_accessor.embedx_sgd_param().adam().initial_range();
+      if (sparse_table_accessor.embedx_sgd_param()
+              .adam()
+              .weight_bounds_size() == 2) {
+        config["mf_min_bound"] =
+            sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0];
+        config["mf_max_bound"] =
+            sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1];
+      }
+    }
+    config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
+  }
+
+  auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance();
+  ps_gpu_wrapper->InitializeGPUServer(config);
+
   scale_datanorm_ = trainer_desc.scale_datanorm();
   int place_num = trainer_desc.worker_places_size();
   const std::vector<paddle::framework::DataFeed*> readers =
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index c993895a9f0ea..85eef89ee27f6 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -36,6 +36,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/reader/blocking_queue.h"
 #include "paddle/phi/backends/dynload/port.h"
 
+#ifdef PADDLE_WITH_PSLIB
+#include <pslib.h>
+#endif
+
 namespace paddle {
 namespace framework {
 
@@ -287,6 +291,9 @@ class PSGPUTrainer : public TrainerBase {
   int mpi_rank_;
   int mpi_size_;
   int dump_file_num_;
+
+  // _ps_param for gpups optimizer config
+  ::paddle::PSParameter _ps_param;
 };
 #endif
 
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 96d312437b34c..6fe33545aa22d 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -66,6 +66,9 @@ message TrainerDesc {
   repeated int32 trainers = 35;
   optional int32 trainer_id = 36;
 
+  // add for gpu
+  optional string fleet_desc = 37;
+
   // device worker parameters
   optional HogwildWorkerParameter hogwild_param = 101;
   optional DownpourWorkerParameter downpour_param = 103;
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 39320f5c0acf3..cdc9b14b6e328 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -111,6 +111,10 @@ def _set_infer(self, infer):
 
     def _set_fleet_desc(self, fleet_desc):
         self._fleet_desc = fleet_desc
+        ## serialize fleet_desc
+        from google.protobuf import text_format
+        fleet_desc_str = text_format.MessageToString(fleet_desc)
+        self.proto_desc.fleet_desc = fleet_desc_str
 
     def _gen_trainer_desc(self):
         pass
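
Note: the net effect of the PSGPUTrainer::Initialize changes above is to parse the text-format fleet descriptor carried in TrainerDesc.fleet_desc and flatten the sparse-table optimizer settings into a single std::unordered_map<std::string, float>, which is then handed to PSGPUWrapper::InitializeGPUServer. The stand-alone sketch below only illustrates the key/value layout produced for the DownpourFeatureValueAccessor / DownpourCtrAccessor / DownpourCtrDoubleAccessor case; the numeric values are placeholders chosen for the example, not defaults taken from the patch.

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  // Keys mirror the ones PSGPUTrainer::Initialize fills from the parsed
  // fleet_desc; the comments name the proto fields they come from.
  // Values here are illustrative placeholders only.
  std::unordered_map<std::string, float> config = {
      {"nonclk_coeff", 0.1f},           // downpour_accessor_param.nonclk_coeff
      {"clk_coeff", 1.0f},              // downpour_accessor_param.click_coeff
      {"learning_rate", 0.05f},         // sparse_sgd_param.learning_rate
      {"initial_g2sum", 3.0f},          // sparse_sgd_param.initial_g2sum
      {"initial_range", 1e-4f},         // sparse_sgd_param.initial_range
      {"min_bound", -10.0f},            // sparse_sgd_param.weight_bounds[0]
      {"max_bound", 10.0f},             // sparse_sgd_param.weight_bounds[1]
      {"mf_create_thresholds", 10.0f},  // accessor.embedx_threshold
  };
  for (const auto& kv : config) {
    std::cout << kv.first << " = " << kv.second << "\n";
  }
  // In the trainer this map is handed to the GPU parameter server:
  //   paddle::framework::PSGPUWrapper::GetInstance()->InitializeGPUServer(config);
  return 0;
}

For the DownpourUnitAccessor / DownpourDoubleUnitAccessor classes the same idea applies, but the embedding-table settings are read from embedx_sgd_param and stored under the "mf_"-prefixed keys (mf_learning_rate, mf_initial_range, mf_min_bound, mf_max_bound, ...), as shown in the diff.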