From de5977fcc4b6150c6df05499b36499f30955a54f Mon Sep 17 00:00:00 2001 From: Yulu Jia Date: Wed, 11 Aug 2021 10:12:21 -0700 Subject: [PATCH 1/3] 3rdparty: update ps-lite update to the latest ps-lite. Signed-off-by: Yulu Jia --- 3rdparty/ps-lite | 2 +- MANIFEST.in | 3 ++- byteps/common/global.cc | 6 +++--- byteps/server/server.cc | 18 +++++++++++++++--- byteps/server/server.h | 4 ++++ 5 files changed, 25 insertions(+), 8 deletions(-) diff --git a/3rdparty/ps-lite b/3rdparty/ps-lite index 28330e6567..49e4582eb2 160000 --- a/3rdparty/ps-lite +++ b/3rdparty/ps-lite @@ -1 +1 @@ -Subproject commit 28330e65672a72e07bb7317821b542dca6574356 +Subproject commit 49e4582eb242467bee57462c4aff1c846cd0eb70 diff --git a/MANIFEST.in b/MANIFEST.in index 430751d172..8628c82fe0 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,8 @@ include */* LICENSE byteps.lds byteps.exp +recursive-include * *.cc *.h prune .git prune dist -recursive-include * *.cc *.h +prune bin prune __pycache__ prune 3rdparty graft 3rdparty/ps-lite diff --git a/byteps/common/global.cc b/byteps/common/global.cc index 770c79c037..8db5bce27f 100644 --- a/byteps/common/global.cc +++ b/byteps/common/global.cc @@ -286,8 +286,8 @@ ps::KVWorker* BytePSGlobal::GetOrInitPS() { if (!_ps && IsDistributed() && _my_role == BytePSRole::LOCAL_ROOT) { // only the root needs networking // init low-level ps implementation - _ps = new ps::KVWorker(0, 0); - ps::StartAsync(0, "byteps\0"); + ps::StartPS(0, ps::Node::WORKER, -1, true, "byteps\0"); + _ps = new ps::KVWorker(0, 0, 0); if (BytePSGlobal::IsResuming() || !ps::Postoffice::Get()->is_recovery()) { ps::Postoffice::Get()->Barrier( 0, ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); @@ -344,7 +344,7 @@ void BytePSGlobal::Shutdown() { if (_ps) { // shutdown _ps and wait for the completion acks of other workers/servers - ps::Finalize(0, true); + ps::Finalize(0, ps::Node::WORKER, true); delete _ps; _ps = NULL; } diff --git a/byteps/server/server.cc b/byteps/server/server.cc index 7d2979b05d..d8cb381ec3 100644 --- a/byteps/server/server.cc +++ b/byteps/server/server.cc @@ -408,6 +408,18 @@ void init_global_env() { // enable to print key profile log_key_info_ = GetEnv("PS_KEY_LOG", 0); + std::string role_str = GetEnv("DMLC_ROLE", "server"); + role_ = ps::GetRole(role_str); + if (role_str == std::string("server")) { + is_server_ = true; + preferred_rank = -1; + } else { + is_server_ = false; + preferred_rank = 0; + } + + LOG(INFO) << "This is a " << role_str << " is_server=" << is_server_; + // enable engine block mode (default disabled) is_engine_blocking_ = GetEnv("BYTEPS_SERVER_ENGINE_BLOCKING", 0); if (is_engine_blocking_) @@ -480,16 +492,16 @@ extern "C" void byteps_server() { } // init server instance - byteps_server_ = new KVServer(0); + ps::StartPS(0, role_, preferred_rank, true, "byteps\0"); + byteps_server_ = new KVServer(0, false, 0); byteps_server_->set_request_handle(BytePSHandler); - StartAsync(0, "byteps_server\0"); if (!Postoffice::Get()->is_recovery()) { Postoffice::Get()->Barrier( 0, ps::kWorkerGroup + ps::kServerGroup + ps::kScheduler); } // clean the server resource - Finalize(0, true); + Finalize(0, role_, true); if (byteps_server_) { delete byteps_server_; byteps_server_ = nullptr; diff --git a/byteps/server/server.h b/byteps/server/server.h index f24412d902..6ac49690ff 100644 --- a/byteps/server/server.h +++ b/byteps/server/server.h @@ -129,6 +129,10 @@ volatile bool sync_mode_ = true; volatile bool debug_mode_ = false; volatile bool enable_schedule_ = false; +ps::Node::Role role_; +int preferred_rank = -1; +volatile bool is_server_ = true; + // debug uint64_t debug_key_; std::mutex debug_mu_; From f81698e614182cad48add80f8f068493c3d1c445 Mon Sep 17 00:00:00 2001 From: Yulu Jia Date: Wed, 11 Aug 2021 10:42:42 -0700 Subject: [PATCH 2/3] 3rdparty: update ps-lite update to the latest ps-lite. Signed-off-by: Yulu Jia --- byteps/common/shared_memory.cc | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/byteps/common/shared_memory.cc b/byteps/common/shared_memory.cc index 570687aff7..4157147a20 100644 --- a/byteps/common/shared_memory.cc +++ b/byteps/common/shared_memory.cc @@ -31,7 +31,7 @@ void* BytePSSharedMemory::openSharedMemory(const std::string& prefix, std::string shm_name(prefix); shm_name += std::to_string(key); int shm_fd = shm_open(shm_name.c_str(), O_CREAT | O_RDWR, 0666); - BPS_CHECK_GE(shm_fd, 0) << "shm_open failed for " << shm_name; + BPS_CHECK_GE(shm_fd, 0) << "shm_open failed for " << shm_name << " " << strerror(errno); BPS_CHECK_GE(ftruncate(shm_fd, size), 0) << strerror(errno); diff --git a/setup.py b/setup.py index 7462b8550f..5b0bd850cd 100644 --- a/setup.py +++ b/setup.py @@ -642,7 +642,7 @@ def get_nccl_vals(): nccl_include_dirs += ['%s/include' % nccl_home] nccl_lib_dirs += ['%s/lib' % nccl_home, '%s/lib64' % nccl_home] - nccl_link_mode = os.environ.get('BYTEPS_NCCL_LINK', 'STATIC') + nccl_link_mode = os.environ.get('BYTEPS_NCCL_LINK', 'SHARED') if nccl_link_mode.upper() == 'SHARED': nccl_libs += ['nccl'] else: From f3edf67bc17e0aa19373117fab23b97fb701a74b Mon Sep 17 00:00:00 2001 From: Yulu Jia Date: Wed, 11 Aug 2021 11:36:57 -0700 Subject: [PATCH 3/3] 3rdparty: update ps-lite update to the latest ps-lite. Signed-off-by: Yulu Jia --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5b0bd850cd..b0da4fd6d1 100644 --- a/setup.py +++ b/setup.py @@ -892,7 +892,7 @@ def build_extensions(self): if build_ucx(): ucx_path = pre_setup.ucx_path.strip() if not ucx_path: - ucx_path = "https://codeload.github.com/openucx/ucx/zip/824c9f03" + ucx_path = "https://github.com/openucx/ucx/archive/refs/tags/v1.11.0.zip" print("ucx_path is", ucx_path) cmd = "sudo apt install -y build-essential libtool autoconf automake libnuma-dev unzip;" +\ "rm -rf ucx*;" +\