diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index f46c5bf7ac09d..ed98ca60e4d83 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -21,6 +21,11 @@ else() set(ASCEND_DIR /usr/local/Ascend) endif() +if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) + # It means CANN 20.2 + + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + if(WITH_ASCEND) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) @@ -43,9 +48,7 @@ if(WITH_ASCEND) set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) - add_definitions(-DPADDLE_WITH_ASCEND_STRING) - endif() + ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 665e7b2fcf8bc..c5fe08ab78224 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -400,6 +400,7 @@ OperatorBase::OperatorBase(const std::string& type, // framework::OpRegistry::CreateOp(type, {}, {}, {}, false). // Inputs, outputs and attrs will be set to empty map // to improve the execution efficiency of dygraph. 
+ if (inputs_.size() > 0 || outputs_.size() > 0) { GenerateTemporaryNames(); CheckAllInputOutputSet(); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 2fd4de5cfcba4..e2c4bdd978e8f 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -31,6 +31,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -45,6 +50,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -157,6 +166,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6fe18f2479478..2d3550f8f0636 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -159,7 +159,6 @@ endif() if (WITH_ASCEND_CL) cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) - cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op) cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op scope device_context enforce executor compare_op) endif() diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 46f9f7ff08944..3db458050255e 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -61,7 +61,6 @@ class 
CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { size_t x_size = xs.size(); for (size_t i = 0; i < x_size; ++i) { - found_inf_data = true; const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); @@ -77,6 +76,8 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { NpuOpRunner("CheckNumerics", {*x}, {check_xout}, {{"message", std::string("check_nan_and_inf")}}); runner_checknumerics.Run(stream); + ctx.template device_context() + .Wait(); } catch (platform::EnforceNotMet& exception) { LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; found_inf_data = true; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index 99e81a4757d0e..1ed188b15939e 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -110,10 +110,10 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) { // out found_inf Tensor found_inf_tensor; found_inf_tensor.Resize({1}); - bool *is_finite_data = + bool *found_inf_data = found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); f::TensorCopy(*found_inf, place, &found_inf_tensor); - EXPECT_FALSE(*is_finite_data); + EXPECT_TRUE(*found_inf_data); ctx.Wait(); } diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 93caa5354dbc4..1a8ac46b637f4 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op 
c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -24,39 +24,43 @@ if(WITH_GLOO) endif() if(WITH_XPU_BKCL) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper nccl_common) op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() if(WITH_ASCEND_CL) -set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") if(WITH_ASCEND_CL) - set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hcom_op op_registry ascend_hccl flags + set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags dynamic_loader dynload_warpctc scope device_context enforce executor) cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc - DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc - DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc DEPS c_reducescatter_op ${COLLECTIVE_DEPS} 
${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc - DEPS op_registry c_broadcast_op c_comm_init_hcom_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) - cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc DEPS op_registry elementwise_add_op c_sync_calc_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) endif() diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc index ea6caf954d14c..f142367e9f1be 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -31,20 +31,19 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel { #if defined(PADDLE_WITH_ASCEND_CL) auto in = ctx.Input("X"); auto out = 
ctx.Output("Out"); - hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); int ring_id = ctx.Attr("ring_id"); std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); auto place = ctx.GetPlace(); auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); int nranks = comm->nranks(); - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); framework::DDim out_dims = in->dims(); out_dims[0] *= nranks; out->mutable_data(out_dims, place); - int64_t send_numel = in->numel(); + uint64_t send_numel = in->numel(); void *send_buff = reinterpret_cast(const_cast(in->data())); void *recv_buff = reinterpret_cast(out->data()); @@ -59,12 +58,11 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel { VLOG(3) << "begin hccl allgather, parameter is: " << ", group is " << group << ", ring_id is " << ring_id - << ", nranks is " << nranks - << ", tag is " << tag; + << ", nranks is " << nranks; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather( - tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype, - group.c_str(), (void*)stream)); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather( + send_buff, recv_buff, send_numel, dtype, + comm->comm(), (void*)stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index 38f19170af958..922bfe868ebbb 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -34,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -45,7 +46,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_allgather); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_allgather, NPU); DECLARE_string(selected_npus); @@ -56,28 +58,68 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -85,12 +127,12 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); std::vector init; int rank_id = atoi(getenv("RANK_ID")); - + int num1 = 1; int num2 = 4; @@ -104,7 +146,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); 
tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -115,15 +157,15 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { attrs["tag"]=std::string("tagx"); attrs["ring_id"]=0; attrs["nranks"]=2; - - auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); for (int i = 0; i < 10; i ++) { op->Run(*scope, place); } ctx.Wait(); - + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -142,10 +184,12 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { TEST(c_allgather, NPU) { f::Scope scope; + HcclRootInfo hccl_id; - // only support one device, if more than one device, use first default + // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHCCLAllGatherOp(&scope, ctx); } diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 3fdc859506754..14f1cb4dcbda5 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -34,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -45,7 +46,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_allreduce_max); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); DECLARE_string(selected_npus); @@ -59,7 +61,7 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -85,7 +127,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); std::vector init; @@ -104,7 +146,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -115,8 
+157,8 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { attrs["tag"]=std::string("tagx"); attrs["ring_id"]=0; - auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); for (int i = 0; i < 10; i ++) { op->Run(*scope, place); @@ -137,10 +179,12 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { TEST(c_allreduce_max, NPU) { f::Scope scope; + HcclRootInfo hccl_id; // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHCCLAllReduceOp(&scope, ctx); } diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 0011d9e9ad83a..a770096e43110 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -117,34 +117,18 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - - // we need to pre-allocate 512 Bytes before the data - // and 512 Bytes after the data, so the hccl allreduce - // can work. This is a must acooding to huawei peer. 
- #define PRE_MALLOC_SIZE_BYTES 512 - auto in = ctx.Input("X"); auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); - hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); int64_t numel = in->numel(); - int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T); - int64_t tmp_numel = numel + pre_tmp_size * 2; - - paddle::framework::LoDTensor tmp_in, tmp_out; - tmp_in.Resize({tmp_numel}); - tmp_out.Resize({tmp_numel}); - auto p_tmp_in = tmp_in.mutable_data(place); // allocate - auto p_tmp_out = tmp_out.mutable_data(place); // allocate - - void* sendbuff = reinterpret_cast(tmp_in.data() + pre_tmp_size); - void* recvbuff = reinterpret_cast(tmp_out.data() + pre_tmp_size); + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); int ring_id = ctx.Attr("ring_id"); std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); aclrtStream stream = nullptr; auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); @@ -154,33 +138,22 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { stream = comm->stream(); } - // we need to memset this memory firstly to avoid core by hccl - platform::NPUMemsetAsync(static_cast(p_tmp_in), 0, tmp_numel*sizeof(T), stream); - platform::NPUMemsetAsync(static_cast(p_tmp_out), 0, tmp_numel*sizeof(T), stream); - - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); - - memory::Copy(npu_place, sendbuff, - npu_place, reinterpret_cast(const_cast(in->data())), - numel * sizeof(T), - stream); - - hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM; + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; switch (red_type) { case kRedSum: - hccl_red_type = HCCL_REP_OP_SUM; + hccl_red_type = HCCL_REDUCE_SUM; break; case kRedMax: - 
hccl_red_type = HCCL_REP_OP_MAX; + hccl_red_type = HCCL_REDUCE_MAX; break; case kRedMin: - hccl_red_type = HCCL_REP_OP_MIN; + hccl_red_type = HCCL_REDUCE_MIN; break; case kRedProd: - hccl_red_type = HCCL_REP_OP_PROD; + hccl_red_type = HCCL_REDUCE_PROD; break; default: @@ -192,16 +165,10 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { << "input num: " << numel << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type - << ", group is: " << group - << ", tag is " << tag; - - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce( - tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type, group.c_str(), (void*)stream)); + << ", group is: " << group; - memory::Copy(npu_place, reinterpret_cast(out->data()), - npu_place, recvbuff, - numel * sizeof(T), - stream); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), (void*)stream)); out->Resize(in->dims()); #else diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 2fff84593c008..600d85743283b 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -42,7 +43,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_allreduce_sum); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); DECLARE_string(selected_npus); @@ -56,7 +58,7 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ VLOG(3) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; 
comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -82,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); int rank_id = atoi(getenv("RANK_ID")); @@ -101,7 +143,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) tensor_x->Resize({num1, num2}); ctx.Wait(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -113,8 +155,8 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) attrs["ring_id"]=0; auto op = f::OpRegistry::CreateOp("c_allreduce_sum", - {{"X", {"X"}}}, - {{"Out", {"Out"}}}, + {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); for (int i = 0; i < 10; i ++) { @@ -136,11 +178,13 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) TEST(c_allreduce_sum, NPU) { f::Scope scope; + HcclRootInfo hccl_id; // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); for(int i = 0; i < 1; i ++){ VLOG(2) << "iter num: " << i; TestHCCLAllReduceOp(&scope, ctx, i); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc index 67410a1c70426..9dd9a31610985 100644 --- 
a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -30,7 +30,7 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel { auto x = ctx.Input("X"); void *ptr = reinterpret_cast(const_cast(x->data())); int numel = x->numel(); - hcclDataType_t dtype = platform::ToHCCLDataType(x->type()); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); auto out = ctx.Output("Out"); @@ -48,14 +48,12 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel { int root = ctx.Attr("root"); std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); VLOG(3) << "begin hccl broadcast, parameter is: "<< "root " << root - << ", group is " << group - << ", tag is " << tag; + << ", group is " << group << ", comm: " << comm->comm() << ", stream: " << stream; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(tag.c_str(), ptr, numel, - dtype, (uint32_t)root, group.c_str(), (void*)stream)); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel, + dtype, (uint32_t)root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " << framework::product(out->dims()); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 66158e5ff28ae..6e82e6c0bfde7 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -42,7 +43,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_broadcast); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_broadcast, NPU); DECLARE_string(selected_npus); @@ -53,28 +55,68 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -82,12 +124,12 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); int num = 2; std::vector init; int rank_id = atoi(getenv("RANK_ID")); - + for (int64_t i = 0; i < num * num; ++i) { init.push_back(1.0 + rank_id); } @@ -98,7 +140,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = 
scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num, num}); tensor_out->mutable_data(place); // allocate @@ -110,14 +152,14 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { attrs["root"]=0; attrs["ring_id"]=0; - auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); for (int i = 0; i < 10; i ++) { op->Run(*scope, place); } ctx.Wait(); - + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -131,10 +173,11 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { TEST(c_broadcast, NPU) { f::Scope scope; - - // only support one device, if more than one device, use first default + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHCCLBroadcastOp(&scope, ctx); } diff --git a/paddle/fluid/operators/collective/c_comm_init_hcom_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc similarity index 62% rename from paddle/fluid/operators/collective/c_comm_init_hcom_op.cc rename to paddle/fluid/operators/collective/c_comm_init_hccl_op.cc index f720ffdd0fe8c..1677ac5bcb01d 100644 --- a/paddle/fluid/operators/collective/c_comm_init_hcom_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,66 +12,67 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/hccl_helper.h" - #include -#include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace framework { class Scope; } // namespace framework } // namespace paddle +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif namespace paddle { namespace operators { -class CCommInitOpNPU : public framework::OperatorBase { +class CCommInitOpAscend : public framework::OperatorBase { public: - CCommInitOpNPU(const std::string& type, - const framework::VariableNameMap& inputs, + CCommInitOpAscend(const std::string& type, const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - int rid = Attr("ring_id"); - int nranks = Attr("nranks"); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "CCommInitOpAscend can run on npu place only.")); + + auto var = scope.FindVar(Input("X")); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input can not be empty.")); +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo* hccl_id = var->GetMutable(); + + int rank_ids = Attr("rank_ids"); int rank_id = Attr("rank"); + int rid = Attr("ring_id"); int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device; if (Attr("device_id") >= 0) { device_id = Attr("device_id"); } - std::vector rank_ids = Attr>("rank_ids"); - - VLOG(3) << "begin c_comm_init on npu, parameters are: " - << "ring id[" << rid - << "], nranks[" << nranks - << "], rank_id[" << rank_id - << "], device_id[" << device_id - << "]"; - 
platform::HCCLCommContext::Instance().CreateHCCLComm( - rank_ids, rank_id, device_id, rid); + hccl_id, rank_ids, rank_id, device_id, rid); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif } }; -class CCommInitOpNPUMaker : public framework::OpProtoAndCheckerMaker { +class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { + AddInput("X", "Raw variable contains a HCCL UniqueId instance."); AddComment(R"DOC( -CCommInit operator on NPU +CCommInit operator -Initialize collective communication context within this trainer +Initialize collective communication context within this trainer )DOC"); - AddAttr("nranks", "(int) The number of ranks of distributed trainers"); - AddAttr>("rank_ids", "The world rank ids of the group"); + AddAttr("rank_ids", "(int) The number of ranks of distributed trainers"); AddAttr("rank", "(int) The rank of the trainer in distributed training."); AddAttr("device_id", @@ -89,6 +90,4 @@ Initialize collective communication context within this trainer namespace ops = paddle::operators; -REGISTER_OPERATOR(c_comm_init_hcom, ops::CCommInitOpNPU, ops::CCommInitOpNPUMaker); - -#endif +REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend, ops::CCommInitOpAscendMaker); diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 0000000000000..78f1076f7eac4 --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#endif + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) { + } + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int rank = Attr("rank"); + framework::Scope& local_scope = scope.NewScope(); + + std::function func = [&](size_t i) -> std::string { + return Output("Out"); + }; + + if (rank == 0) { + std::vector endpoint_list = + Attr>("other_endpoints"); + SendBroadCastHCCLID(endpoint_list, 1, func, local_scope); + } else { + std::string endpoint = Attr("endpoint"); + RecvBroadCastHCCLID(endpoint, 1, func, local_scope); + } + scope.DeleteScope(&local_scope); + } +}; + +#else + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const 
framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + } +}; + +#endif + +class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + VLOG(3) << "ele"; + AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +CGenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); + AddAttr>( + "other_endpoints", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of other trainer endpoints") + .SetDefault({}); + AddAttr("rank", + "(int default 0) " + "The rank of the trainer in distributed training.") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 9446c38dcba32..915bf8391992d 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -63,6 +63,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { }; #else + class CGenNCCLIdOp : public framework::OperatorBase { public: CGenNCCLIdOp(const std::string& type, diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 4e9d804fc36ea..a8df0512dc59a 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -121,31 +121,15 @@ class CReduceOpASCENDKernel 
: public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - - // we need to pre-allocate 512 Bytes before the data - // and 512 Bytes after the data, so the hccl allreduce - // can work. This is a must acooding to huawei peer. - #define PRE_MALLOC_SIZE_BYTES 512 - auto in = ctx.Input("X"); auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); - hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); int64_t numel = in->numel(); - int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T); - int64_t tmp_numel = numel + pre_tmp_size * 2; - - paddle::framework::LoDTensor tmp_in, tmp_out; - tmp_in.Resize({tmp_numel}); - tmp_out.Resize({tmp_numel}); - auto p_tmp_in = tmp_in.mutable_data(place); // allocate - auto p_tmp_out = tmp_out.mutable_data(place); // allocate + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); - void* sendbuff = reinterpret_cast(tmp_in.data() + pre_tmp_size); - void* recvbuff = reinterpret_cast(tmp_out.data() + pre_tmp_size); - - std::string tag = ctx.Attr("tag"); int ring_id = ctx.Attr("ring_id"); int root_id = ctx.Attr("root_id"); std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); @@ -161,33 +145,22 @@ class CReduceOpASCENDKernel : public framework::OpKernel { int rank_id = comm->rank(); - // we need to memset this memory firstly to avoid core by hccl - platform::NPUMemsetAsync(static_cast(p_tmp_in), 0, tmp_numel*sizeof(T), stream); - platform::NPUMemsetAsync(static_cast(p_tmp_out), 0, tmp_numel*sizeof(T), stream); - - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); - - memory::Copy(npu_place, sendbuff, - npu_place, reinterpret_cast(const_cast(in->data())), - numel * sizeof(T), - stream); - - hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM; + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; switch 
(red_type) { case kRedSum: - hccl_red_type = HCCL_REP_OP_SUM; + hccl_red_type = HCCL_REDUCE_SUM; break; case kRedMax: - hccl_red_type = HCCL_REP_OP_MAX; + hccl_red_type = HCCL_REDUCE_MAX; break; case kRedMin: - hccl_red_type = HCCL_REP_OP_MIN; + hccl_red_type = HCCL_REDUCE_MIN; break; case kRedProd: - hccl_red_type = HCCL_REP_OP_PROD; + hccl_red_type = HCCL_REDUCE_PROD; break; default: @@ -200,18 +173,14 @@ class CReduceOpASCENDKernel : public framework::OpKernel { << "root_id: " << root_id << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type - << ", group is: " << group - << ", tag is " << tag; + << ", group is: " << group; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce( - tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type, group.c_str(), (void*)stream)); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), (void*)stream)); - if(rank_id == root_id){ - memory::Copy(npu_place, reinterpret_cast(out->data()), - npu_place, recvbuff, - numel * sizeof(T), - stream); - }else{ + + if(rank_id != root_id){ + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); memory::Copy(npu_place, reinterpret_cast(out->data()), npu_place, reinterpret_cast(const_cast(in->data())), numel * sizeof(T), diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index 36ec6d155a214..6f99ec5b49896 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -42,7 +43,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_reduce_sum); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); DECLARE_string(selected_npus); @@ -56,7 +58,7 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ VLOG(3) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; 
comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -82,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); int rank_id = atoi(getenv("RANK_ID")); @@ -101,7 +143,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { tensor_x->Resize({num1, num2}); ctx.Wait(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -115,8 +157,8 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { attrs["root_id"]=root_id; auto op = f::OpRegistry::CreateOp("c_reduce_sum", - {{"X", {"X"}}}, - {{"Out", {"Out"}}}, + {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); op->Run(*scope, place); @@ -141,11 +183,13 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { TEST(c_reduce_sum, NPU) { f::Scope scope; + HcclRootInfo hccl_id; // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); for(int i = 0; i < 2; i ++){ VLOG(2) << "iter num: " << i; TestHCCLReduceOp(&scope, ctx, i); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc index e38bcdea27da7..22769ae3d2e96 100644 --- 
a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -35,7 +35,6 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel { auto place = ctx.GetPlace(); auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); int nranks = comm->nranks(); - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); auto out_dims = in->dims(); PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, @@ -47,11 +46,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel { out_dims[0] = out_dims[0] / nranks; out->mutable_data(out_dims, place); - int64_t recv_numel = in->numel() / nranks; + uint64_t recv_numel = in->numel() / nranks; void* inputPtr = reinterpret_cast(const_cast(in->data())); void* outputPtr = reinterpret_cast(out->data()); - hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); aclrtStream stream = nullptr; if (ctx.Attr("use_calc_stream")) { @@ -63,12 +62,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel { VLOG(3) << "begin hccl reduce scatter, parameter is: " << "recv_numel: " << recv_numel << "dtype: " << dtype - << "hccl_red_type: " << HCCL_REP_OP_SUM - << ", group is: " << group - << ", tag is " << tag; + << "hccl_red_type: " << HCCL_REDUCE_SUM + << ", group is: " << group; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_reduce_scatter( - tag.c_str(), inputPtr, outputPtr, (u64)recv_numel, dtype, HCCL_REP_OP_SUM, group.c_str(), (void*)stream)); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), (void*)stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU.")); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index 
1c21ab19b954c..9eb5813174345 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -45,7 +46,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_reducescatter); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); DECLARE_string(selected_npus); @@ -59,7 +61,7 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); } + void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); std::vector init; @@ -103,7 +146,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -116,14 +159,14 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const 
p::DeviceContext& ctx) { attrs["ring_id"]=0; attrs["nranks"]=2; - auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); int iter_num = 10; for (int i = 0; i < iter_num; i ++) { op->Run(*scope, place); + ctx.Wait(); } - ctx.Wait(); std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); @@ -132,16 +175,18 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { PrintDebugInfo("output data", out_vec); EXPECT_EQ(out_vec.size(), init.size() / 2); for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], iter_num + 1); + EXPECT_EQ(out_vec[i], 2.0); } } TEST(c_reducescatter, NPU) { f::Scope scope; + HcclRootInfo hccl_id; // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHCCLReduceScatterOp(&scope, ctx); } diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index 371bcc4cfcfef..44e837c66ad61 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -41,7 +41,7 @@ namespace m = paddle::operators::math; USE_OP(c_broadcast); USE_NO_KERNEL_OP(c_sync_comm_stream); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_broadcast, NPU); void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 0000000000000..fe247df81fcd4 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,217 
@@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. 
Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. 
init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. 
hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + } +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] 
" + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 0000000000000..c565ea4ae027f --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,349 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. +#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client 
+ return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + 
CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + 
VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 
0000000000000..1ad6f791e1fc3 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listens on endpoint, then receives the hccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// receive the hccl id from an already-listening socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index 80dad82386d1f..0db2bd3298302 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -27,32 +27,39 @@ class CRecvOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto 
out = ctx.Output("Out"); - int numel = out->numel(); - hcclDataType_t dtype = platform::ToHCCLDataType(out->type()); + auto x = ctx.Output("Out"); + void *ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); int ring_id = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); - auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); if (ctx.Attr("use_calc_stream")) { - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); - std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); - int srcRank = ctx.Attr("peer"); - int srTag = ctx.Attr("srTag"); - VLOG(3) << "recv_v2_npu attr get"; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_receive( - tag.c_str(), reinterpret_cast(const_cast(out->data())), (u64)numel, dtype, srcRank, - srTag, group.c_str(), stream)); - VLOG(3) << "Source Rank: " << srcRank << " Invoke hcom receive. 
receiving "; - out->Resize(out->dims()); - out->set_lod(out->lod()); + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", + nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: "<< "root " << root + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel, + dtype, (uint32_t)root, comm->comm(), stream)); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU.")); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 727d8be5a8f9a..22b90920e2891 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -31,6 +31,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/recv_v2_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -42,30 +44,72 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(recv_v2); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(recv_v2, NPU); -void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ - std::string rank_table_file = getenv("RANK_TABLE_FILE"); - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - int src_rank = atoi(getenv("SRC_RANK")); - int dest_rank = atoi(getenv("DEST_RANK")); - VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0,1}; - f::AttributeMap 
comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); - VLOG(3) << "CreateOp c_comm_init_hcom"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // 
comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); } void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ @@ -79,7 +123,7 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = scope->Var("Data"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num, num}); tensor_out->mutable_data(place); // allocate @@ -96,7 +140,7 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ out_shape.push_back(num); attrs["out_shape"]=out_shape; - auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); VLOG(3) << "CreateOp recv_v2"; for (int i = 0; i < 10; i ++) { @@ -113,12 +157,13 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ TEST(recv_v2, NPU){ f::Scope scope; + HcclRootInfo hccl_id; + char * npu_id=getenv("FLAGS_selected_npus"); VLOG(3) << "Select npu:" << npu_id; p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - VLOG(3) << "Place over"; - Prepare(&scope, ctx); - VLOG(3) << "Prepare over"; + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHcomRecvOp(&scope, ctx); - VLOG(3) << "Test over"; } diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 15fcec269c569..621c9ffe4cd17 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -28,31 +28,37 @@ class CSendOpASCENDKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) auto x = ctx.Input("X"); + void *ptr = 
reinterpret_cast(const_cast(x->data())); int numel = x->numel(); - hcclDataType_t dtype = platform::ToHCCLDataType(x->type()); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); - auto place = ctx.GetPlace(); int ring_id = ctx.Attr("ring_id"); - auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + auto place = ctx.GetPlace(); + auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); if (ctx.Attr("use_calc_stream")) { - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); - std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); - int destRank = ctx.Attr("peer"); - int srTag = ctx.Attr("srTag"); - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_send( - tag.c_str(), reinterpret_cast(const_cast(x->data())), (u64)numel, dtype, destRank, - srTag, group.c_str(), stream)); + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", + nranks)); + + int root = rank; + + VLOG(3) << "begin hccl send, parameter is: "<< "root " << root + << ", comm: " << comm->comm() << ", stream: " << stream; - VLOG(3) << "Dest rank:" << destRank << " Invoke hcom send. 
Sent " - << x->numel(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel, + dtype, (uint32_t)root, comm->comm(), stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index 7916d155ee761..3f47870b841c9 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/send_v2_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -41,34 +42,78 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(send_v2); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(send_v2, NPU); -void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ - std::string rank_table_file = getenv("RANK_TABLE_FILE"); - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - int src_rank = atoi(getenv("SRC_RANK")); - int dest_rank = atoi(getenv("DEST_RANK")); - VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + int rank_id = atoi(getenv("RANK_ID")); + 
int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); } void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ std::cout<< "BEGIN TEST:"<< __FUNCTION__ <Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); int num = atoi(getenv("DATA_SIZE"));; EXPECT_GT(num, 0); @@ 
-88,24 +133,24 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ attrs["ring_id"]=0; attrs["srTag"]=0; - auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs); - + auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + for (int i = 0; i < 10; i ++) { op->Run(*scope, place); } VLOG(3)<<"send run over"; - ctx.Wait(); + ctx.Wait(); } TEST(send_v2, NPU){ f::Scope scope; + HcclRootInfo hccl_id; + char * npu_id=getenv("FLAGS_selected_npus"); VLOG(3) << "Select npu:" << npu_id; p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - VLOG(3) << "Place over"; - Prepare(&scope, ctx); - VLOG(3) << "Prepare over"; - TestHcomSendOp(&scope, ctx); - VLOG(3) << "Test over"; + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); } diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 4516aa38fb39b..320b498156b3f 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -28,6 +28,12 @@ class LookupTableV2NPUKernel : public framework::OpKernel { auto *ids_t = ctx.Input("Ids"); // int tensor auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); + + // It seems cann 20.1 accepts int64, but cann 20.2+ not. 
+ PADDLE_ENFORCE_EQ(ids_t->type(), framework::proto::VarType::INT32, + platform::errors::Unimplemented( + "The index of LookupTableV2 should be int32.")); + auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( table_var->IsType(), true, @@ -49,28 +55,26 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto *ids_t = ctx.Input("Ids"); + auto *output_grad_t = ctx.Input(framework::GradVarName("Out")); auto *table_grad_t = ctx.Output(framework::GradVarName("W")); - table_grad_t->mutable_data(ctx.GetPlace()); + auto *p = table_grad_t->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() .stream(); - // step2: ZerosLike x in device - Tensor zeroslike_w(table_grad_t->type()); - zeroslike_w.Resize(table_grad_t->dims()); - auto p = zeroslike_w.mutable_data(ctx.GetPlace()); - platform::NPUMemsetAsync(static_cast(p), 0, - zeroslike_w.numel() * sizeof(T), stream); + table_grad_t->numel() * sizeof(T), stream); - table_grad_t->mutable_data(ctx.GetPlace()); + // NOTE(zhiqiu): It seems in cann 20.1, the first input and output + // can be different tensor, but in cann 20.2+, it does inplace operation. + // Thus, the first input and output should be same tensor. auto runner_scatter = - NpuOpRunner("ScatterAdd", {zeroslike_w, *ids_t, *output_grad_t}, - {*table_grad_t}, {}); + NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, + {*table_grad_t}, {{"use_locking", true}}); runner_scatter.Run(stream); } }; diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc b/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc deleted file mode 100644 index f37915834bd75..0000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/string/printf.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; -namespace m = paddle::operators::math; - -USE_OP(lookup_table_v2); -USE_OP_DEVICE_KERNEL(lookup_table_v2, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto ids = scope->Var("Ids"); - auto out = scope->Var("Out"); - auto w = scope->Var("W"); - - auto ids_t = ids->GetMutable(); - auto out_t = out->GetMutable(); - auto w_t = w->GetMutable(); - int bsz = 10; - int dim = 32; - int seqlen = 8; - int vocab_size = 100; - TensorFromVector(std::vector(bsz * seqlen, 3), ctx, ids_t); - std::vector val(vocab_size * dim, 10.); - TensorFromVector(val, ctx, w_t); - ids_t->Resize({bsz, seqlen}); - w_t->Resize({vocab_size, dim}); - out_t->Resize({bsz, seqlen, dim}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - out_t->mutable_data(place); - f::AttributeMap attrs = {{}}; - auto op = f::OpRegistry::CreateOp("lookup_table_v2", - {{"W", {"W"}}, {"Ids", {"Ids"}}}, - {{"Out", {"Out"}}}, attrs); - op->Run(*scope, place); - 
std::vector out_v; - TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); - EXPECT_EQ(out_t->numel(), bsz * seqlen * dim); - T res = std::accumulate(out_v.begin(), out_v.end(), 0.); - float eps = 1.e-6; - EXPECT_LT(fabs(res - bsz * seqlen * dim * 10.), eps); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto w = scope->Var("W"); - auto ids = scope->Var("Ids"); - auto out = scope->Var("DOut"); - auto dw = scope->Var("DW"); - - auto w_t = w->GetMutable(); - auto ids_t = ids->GetMutable(); - auto out_t = out->GetMutable(); - auto dw_t = dw->GetMutable(); - - int bsz = 2; - int dim = 2; - int seqlen = 2; - int vocab_size = 4; - - std::vector val_int(bsz * seqlen, 3); - std::vector val(vocab_size * dim, 0.); - std::vector val_out(bsz * seqlen * dim, 1.); - - TensorFromVector(val_int, ctx, ids_t); - TensorFromVector(val, ctx, w_t); - TensorFromVector(val, ctx, dw_t); - TensorFromVector(val_out, ctx, out_t); - - w_t->Resize({vocab_size, dim}); - ids_t->Resize({bsz, seqlen}); - out_t->Resize({bsz, seqlen, dim}); - dw_t->Resize({vocab_size, dim}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - out_t->mutable_data(place); - w_t->mutable_data(place); - dw_t->mutable_data(place); - f::AttributeMap attrs = {{}}; - auto op = f::OpRegistry::CreateOp( - "lookup_table_v2_grad", - {{"Ids", {"Ids"}}, {"W", {"W"}}, {"Out@GRAD", {"DOut"}}}, - {{"W@GRAD", {"DW"}}}, attrs); - op->Run(*scope, place); - ctx.Wait(); - std::vector w_v; - TensorToVector(*dw_t, ctx, &w_v); - ctx.Wait(); - EXPECT_EQ(dw_t->numel(), vocab_size * dim); - T res = std::accumulate(w_v.begin(), w_v.end(), 0.); - float eps = 1.e-6; - EXPECT_LT(fabs(res - bsz * seqlen * dim), eps); -} - -TEST(lookup_table_v2, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); -} - -TEST(lookup_table_v2_grad, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); -} diff --git 
a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index e4fa4a96a5cf7..5bce5719d7cea 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -125,6 +125,11 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, framework::LoDTensor cpu_tensor; platform::CPUPlace cpu_place; TensorCopy(print_tensor, cpu_place, &cpu_tensor); +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(print_tensor.place())) { + platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait(); + } +#endif data = cpu_tensor.data(); } diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 8a205fdf0b787..194f3e3332c11 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -157,15 +157,10 @@ class HCCLComm { virtual int nranks() const = 0; virtual int rank() const = 0; virtual int device_id() const = 0; + virtual HcclComm comm() const = 0; virtual aclrtStream stream() const = 0; virtual NPUDeviceContext* dev_context() const = 0; virtual ~HCCLComm() = default; - - unsigned long NextTagId() { - return tag_counter_++; - } - private: - std::atomic tag_counter_; }; // A singleton HCCL communicator context reserves communication ring ids @@ -176,11 +171,12 @@ class HCCLCommContext { return comm_ctx; } - HCCLComm* CreateHCCLComm(const std::vector& world_rank_ids, int rank, int dev_id, int ring_id = 0); - + HCCLComm* CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, + int rank, int dev_id, int ring_id); // a latter comm with the same dev_id and the same ring_id // will override the former - HCCLComm* AssignHCCLComm(int nranks, int rank, int dev_id, int ring_id = 0); + HCCLComm* AssignHCCLComm(HcclComm comm, int nranks, int rank, + int dev_id, int ring_id); // retrieve a communicator by the ring id in multiprocessing mode HCCLComm* Get(int ring_id) const { @@ -217,20 +213,21 @@ 
class HCCLCommContext { private: // Init global hcom - HCCLCommContext() { InitHcomWorldGroup(); } + HCCLCommContext() {} + // we may use group feature in the feature + // HCCLCommContext() { InitHcomWorldGroup(); } + HcclComm comm_; public: - ~HCCLCommContext(){ - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_destroy()); - } + ~HCCLCommContext(){ } std::once_flag once_flag_; std::mutex comm_map_mutex_; // ring id to dev-HCCLComm std::map>> comm_map_; - void InitHcomWorldGroup(); + // void InitHcomWorldGroup(); void ReleaseHCCLComms(); DISABLE_COPY_AND_ASSIGN(HCCLCommContext); diff --git a/paddle/fluid/platform/collective_helper_npu.cc b/paddle/fluid/platform/collective_helper_npu.cc index 56cfdec4b5e70..4efa54e774aa3 100644 --- a/paddle/fluid/platform/collective_helper_npu.cc +++ b/paddle/fluid/platform/collective_helper_npu.cc @@ -34,6 +34,13 @@ class HCCLCommImpl : public HCCLComm { return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device; } + ~HCCLCommImpl(){ + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_)); + } + + void set_comm(HcclComm comm) { comm_ = comm; } + HcclComm comm() const override { return comm_; } + aclrtStream stream() const override { return dev_ctx_->stream(); } void set_dev_ctx(std::unique_ptr&& dev_ctx) { @@ -45,46 +52,43 @@ class HCCLCommImpl : public HCCLComm { int ring_id_; int nranks_; int rank_; + HcclComm comm_; std::unique_ptr dev_ctx_; }; -HCCLComm* HCCLCommContext::CreateHCCLComm(const std::vector& world_rank_ids, int rank, int dev_id, int ring_id) { +HCCLComm* HCCLCommContext::CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, + int rank, int dev_id, int ring_id) { + PADDLE_ENFORCE_NOT_NULL(hccl_id, + platform::errors::InvalidArgument( + "The hccl unique id should not be null.")); PADDLE_ENFORCE_GT( - world_rank_ids.size(), 1, + nranks, 1, platform::errors::InvalidArgument( - "Expected world_rank_ids.size() > 1. But received size is %d.", world_rank_ids.size())); + "Expected nranks > 1. 
But received nranks is %d.", nranks)); PADDLE_ENFORCE_GE(rank, 0, platform::errors::InvalidArgument( "Expected rank >= 0. But received rank is %d.", rank)); PADDLE_ENFORCE_LT( - rank, world_rank_ids.size(), + rank, nranks, platform::errors::InvalidArgument( "Expected rank < nranks. But received rank is %d, nranks is %d.", - rank, world_rank_ids.size())); + rank, nranks)); PADDLE_ENFORCE_GE( dev_id, 0, platform::errors::InvalidArgument( "Expected dev_id >= 0. But received dev_id is %d.", dev_id)); - PADDLE_ENFORCE_GE( - ring_id, 0, - platform::errors::InvalidArgument( - "Expected ring_id >= 0. But received ring_id is %d.", ring_id)); - - auto* comm_wrapper = AssignHCCLComm(world_rank_ids.size(), rank, dev_id, ring_id); - // HACK(sunpeng17): hcom API requires bind stream to a model - // but we don't need model in Paddle, so we feed stream pointer as model pointer + HcclComm comm; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id)); PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::hcom_bind_model(comm_wrapper->stream(), - comm_wrapper->stream())); + platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm)); + + VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks << ", hccl_id: " << hccl_id << ", rank: " << rank; - // Get world_rank_ids registered in gen_nccl_id op - std::string group_name = HCOM_GROUP_PREFIX + std::to_string(ring_id); - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_create_group( - group_name.c_str(), world_rank_ids.size(), (unsigned int*)world_rank_ids.data())); + auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id); VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id - << " has been created on device " << dev_id << ", group name: " << group_name; + << " has been created on device " << dev_id << ", with comm: " << comm_wrapper->comm(); std::call_once(once_flag_, []() { std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); }); @@ -93,7 +97,8 @@ HCCLComm* 
HCCLCommContext::CreateHCCLComm(const std::vector& world_rank_ids return comm_wrapper; } -HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int ring_id) { +HCCLComm* HCCLCommContext::AssignHCCLComm(HcclComm comm, int nranks, int rank, + int dev_id, int ring_id) { std::unique_ptr dev_ctx( new NPUDeviceContext(NPUPlace(dev_id))); @@ -101,6 +106,7 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int c->set_ring_id(ring_id); c->set_nranks(nranks); c->set_rank(rank); + c->set_comm(comm); c->set_dev_ctx(std::move(dev_ctx)); comm_map_mutex_.lock(); @@ -112,23 +118,14 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int dev2comm.emplace(dev_id, std::unique_ptr(c)); comm_map_mutex_.unlock(); - return comm_map_[ring_id][dev_id].get(); -} - -void HCCLCommContext::InitHcomWorldGroup() { - const char *rank_table_file = getenv(ENV_RANK_TABLE_FILE); - PADDLE_ENFORCE_NOT_NULL( - rank_table_file, - platform::errors::InvalidArgument("The RANK_TABLE_FILE environment variable should not be null.")); - - const char *rank_id = getenv(ENV_RANK_ID); - PADDLE_ENFORCE_NOT_NULL( - rank_id, - platform::errors::InvalidArgument("The RANK_ID environment variable should not be null.")); + if (ring_id == 0) { + auto* dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::NPUPlace(dev_id))); + dev_ctx->set_hccl_comm(comm); + } - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_init(rank_table_file, rank_id)); - VLOG(3) << "Successfully initialized hcom. 
rank_table_file: " - << rank_table_file << ", rank_id " << rank_id; + return comm_map_[ring_id][dev_id].get(); } void HCCLCommContext::ReleaseHCCLComms() { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index bf34e57b773fa..da602bfd5290e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -185,11 +185,21 @@ class NPUDeviceContext : public DeviceContext { void WaitStreamCallback() const { return stream_->WaitCallback(); } +#if defined(PADDLE_WITH_ASCEND_CL) + /*! \brief Return hccl communicators. */ + HcclComm hccl_comm() const { return hccl_comm_; } + + /*! \brief Set hccl communicators. */ + void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; } +#endif + private: NPUPlace place_; aclrtContext context_; -#ifdef PADDLE_WITH_ASCEND_HCCL - HCCLContext_t hccl_context_; + +#ifdef PADDLE_WITH_ASCEND_CL + // HCCLContext_t hccl_context_; + HcclComm hccl_comm_{nullptr}; #endif // Need to be the same with other DeviceContext, diff --git a/paddle/fluid/platform/dynload/hccl.h b/paddle/fluid/platform/dynload/hccl.h index 2ad62213415c9..328a4e0e78d31 100644 --- a/paddle/fluid/platform/dynload/hccl.h +++ b/paddle/fluid/platform/dynload/hccl.h @@ -13,14 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -// #include -// #include +#include +#include #include // NOLINT #include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/dynload/hcom.h" +// #include "paddle/fluid/platform/dynload/hcom.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#define HCOM_GROUP_PREFIX "HCOM_GROUP_" + namespace paddle { namespace platform { namespace dynload { @@ -43,27 +45,14 @@ extern void* hccl_dso_handle; extern DynLoad__##__name __name #define HCCL_RAND_ROUTINE_EACH(__macro) \ - __macro(hcom_init); \ - __macro(hcom_destroy); \ - __macro(hcom_bind_model); \ - __macro(hcom_unbind_model); \ - __macro(hcom_send); \ - __macro(hcom_receive); \ - __macro(hcom_broadcast); \ - __macro(hcom_all_gather); \ - __macro(hcom_all_reduce); \ - __macro(hcom_reduce_scatter); \ - __macro(hcom_create_group); \ - __macro(hcom_destroy_group); \ - __macro(hcom_get_rank_id); \ - __macro(hcom_get_local_rank_id); \ - __macro(hcom_get_local_rank_size); \ - __macro(hcom_get_split_strategy); \ - __macro(hcom_set_split_strategy_by_size); \ - __macro(hcom_set_split_strategy_by_index); \ - __macro(hcom_get_group_rank_from_world_rank); \ - __macro(hcom_get_world_rank_from_group_rank); - + __macro(HcclReduceScatter); \ + __macro(HcclCommDestroy); \ + __macro(HcclAllReduce); \ + __macro(HcclCommInitRootInfo); \ + __macro(HcclGetRootInfo); \ + __macro(HcclBroadcast); \ + __macro(HcclCommInitClusterInfo); \ + __macro(HcclAllGather); HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 85d255a1f91c1..a3fc148265e8c 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -40,7 +40,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_ASCEND_CL #include "acl/acl.h" -#include "paddle/fluid/platform/dynload/hcom.h" +#include "hccl/hccl_types.h" #endif // PADDLE_WITH_ASCEND_CL #include @@ -1013,7 +1013,7 @@ struct NPUStatusType {}; } DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); -DEFINE_NPU_STATUS_TYPE(hcclResult_t, HCCL_SUCCESS); +DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS); } // namespace details inline std::string build_npu_error_msg(aclError stat) { @@ -1022,7 +1022,7 @@ inline std::string build_npu_error_msg(aclError stat) { return sout.str(); } -inline std::string build_npu_error_msg(hcclResult_t stat) { +inline std::string build_npu_error_msg(HcclResult stat) { std::ostringstream sout; sout << " HCCL error, the error code is : " << stat << ". "; return sout.str(); diff --git a/paddle/fluid/platform/hccl_helper.h b/paddle/fluid/platform/hccl_helper.h index e2d6be57ceecb..8eed32b416da7 100644 --- a/paddle/fluid/platform/hccl_helper.h +++ b/paddle/fluid/platform/hccl_helper.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_HCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_ASCEND_CL) #include #include @@ -24,30 +24,22 @@ #include #include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/platform/collective_helper.h" - -#ifdef PADDLE_WITH_NCCL -#include "paddle/fluid/platform/dynload/nccl.h" -#endif - -#ifdef PADDLE_WITH_RCCL -#include "paddle/fluid/platform/dynload/rccl.h" -#endif #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/dynload/hccl.h" #endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" -#define NCCL_ID_VARNAME "NCCLID" +#define HCCL_ID_VARNAME "HCCLID" namespace paddle { namespace platform { -inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) { 
+inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { if (type == framework::proto::VarType::FP32) { return HCCL_DATA_TYPE_FP32; } else if (type == framework::proto::VarType::FP16) { @@ -56,7 +48,7 @@ inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) { return HCCL_DATA_TYPE_INT32; } else if (type == framework::proto::VarType::INT8) { return HCCL_DATA_TYPE_INT8; - } + } // else if (type == framework::proto::VarType::FP64) { // return HCCL_DATA_TYPE_FP32; // } @@ -66,298 +58,301 @@ inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) { } } -// // NOTE(minqiyang): according to the ncclGroupEnd documentations: -// // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, -// // ncclGroupEnd will wait for all communicators to be initialized, which will -// // cause blocking problem when a runtime_error was thrown, so try only guard -// // NCCL actions when use it. -// class NCCLGroupGuard { +// NOTE(minqiyang): according to the ncclGroupEnd documentations: +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, +// ncclGroupEnd will wait for all communicators to be initialized, which will +// cause blocking problem when a runtime_error was thrown, so try only guard +// HCCL actions when use it. 
+ +// class HCCLGroupGuard { // public: -// static std::mutex &NCCLMutex() { +// static std::mutex &HCCLMutex() { // static std::mutex mtx; // return mtx; // } -// inline NCCLGroupGuard() { -// NCCLMutex().lock(); +// inline HCCLGroupGuard() { +// HCCLMutex().lock(); // PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); // } -// inline ~NCCLGroupGuard() PADDLE_MAY_THROW { +// inline ~HCCLGroupGuard() PADDLE_MAY_THROW { // PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); -// NCCLMutex().unlock(); +// HCCLMutex().unlock(); // } // }; -// struct NCCLContext { -// std::unique_ptr ctx_; -// ncclComm_t comm_; +struct HCCLContext { + std::unique_ptr ctx_; + HcclComm comm_; -// explicit NCCLContext(int dev_id) -// : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} + explicit HCCLContext(int dev_id) + : ctx_(new NPUDeviceContext(NPUPlace(dev_id))), comm_{nullptr} {} -// gpuStream_t stream() const { return ctx_->stream(); } -// ncclComm_t comm() const { return comm_; } + aclrtStream stream() const { return ctx_->stream(); } + HcclComm comm() const { return comm_; } -// int device_id() const { -// return BOOST_GET_CONST(platform::CUDAPlace, ctx_->GetPlace()).device; -// } -// }; + int device_id() const { + return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device; + } +}; + +struct HCCLContextMap { + std::unordered_map contexts_; + std::vector order_; + + explicit HCCLContextMap(const std::vector &places, + HcclRootInfo *hccl_id = nullptr, + size_t num_trainers = 1, size_t trainer_id = 0) { + PADDLE_ENFORCE_EQ(!places.empty(), true, + platform::errors::InvalidArgument( + "The HCCL place should not be empty.")); + order_.reserve(places.size()); + for (auto &p : places) { + int dev_id = BOOST_GET_CONST(NPUPlace, p).device; + order_.emplace_back(dev_id); + contexts_.emplace(dev_id, HCCLContext(dev_id)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + platform::errors::Unavailable("HCCL Context Map does not support " + "contain 
two or more same device.")); + + std::unique_ptr comms(new HcclComm[order_.size()]); + // if num_trainers == 1, should create a new nccl id for local comms. + if (num_trainers == 1 && hccl_id == nullptr) { + // we do not know how to tackle this situation under hccl + // std::lock_guard guard(HCCLGroupGuard::HCCLMutex()); + // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::ncclCommInitAll( + // comms.get(), static_cast(order_.size()), order_.data())); + } else { + PADDLE_ENFORCE_NOT_NULL(hccl_id, platform::errors::InvalidArgument( + "The HCCL id should not be null.")); + { + int nranks = num_trainers * order_.size(); + // HCCLGroupGuard gurad; + for (size_t i = 0; i < order_.size(); ++i) { + int gpu_id = order_[i]; + int rank; + if (order_.size() > 1) { + rank = trainer_id * order_.size() + i; + } else { + rank = trainer_id; + } + VLOG(1) << "init hccl rank:" << rank << ", nranks:" << nranks + << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; + aclrtSetDevice(gpu_id); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommInitRootInfo( + nranks, hccl_id, rank, comms.get() + i)); + } + } + } + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } + } -// struct NCCLContextMap { -// std::unordered_map contexts_; -// std::vector order_; - -// explicit NCCLContextMap(const std::vector &places, -// ncclUniqueId *nccl_id = nullptr, -// size_t num_trainers = 1, size_t trainer_id = 0) { -// PADDLE_ENFORCE_EQ(!places.empty(), true, -// platform::errors::InvalidArgument( -// "The NCCL place should not be empty.")); -// order_.reserve(places.size()); -// for (auto &p : places) { -// int dev_id = BOOST_GET_CONST(CUDAPlace, p).device; -// order_.emplace_back(dev_id); -// contexts_.emplace(dev_id, NCCLContext(dev_id)); -// } -// PADDLE_ENFORCE_EQ( -// order_.size(), contexts_.size(), -// platform::errors::Unavailable("NCCL Context Map does not support " -// "contain two or more same device.")); - -// std::unique_ptr comms(new 
ncclComm_t[order_.size()]); -// // if num_trainers == 1, should create a new nccl id for local comms. -// if (num_trainers == 1 && nccl_id == nullptr) { -// std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); -// PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( -// comms.get(), static_cast(order_.size()), order_.data())); -// } else { -// PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument( -// "The NCCL id should not be null.")); -// { -// int nranks = num_trainers * order_.size(); -// NCCLGroupGuard gurad; -// for (size_t i = 0; i < order_.size(); ++i) { -// int gpu_id = order_[i]; -// int rank; -// if (order_.size() > 1) { -// rank = trainer_id * order_.size() + i; -// } else { -// rank = trainer_id; -// } -// VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks -// << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; -// SetDeviceId(gpu_id); -// PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( -// comms.get() + i, nranks, *nccl_id, rank)); -// } -// } -// } -// int i = 0; -// for (auto &dev_id : order_) { -// contexts_.at(dev_id).comm_ = comms[i++]; -// } -// } + HCCLContextMap(const HCCLContextMap &other) = delete; + HCCLContextMap &operator=(const HCCLContextMap &other) = delete; -// NCCLContextMap(const NCCLContextMap &other) = delete; -// NCCLContextMap &operator=(const NCCLContextMap &other) = delete; + NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } -// CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + NPUDeviceContext *DevCtx(platform::Place p) const { + return DevCtx(BOOST_GET_CONST(NPUPlace, p).device); + } -// CUDADeviceContext *DevCtx(platform::Place p) const { -// return DevCtx(BOOST_GET_CONST(CUDAPlace, p).device); -// } + const HCCLContext &at(platform::Place p) const { + return this->at(BOOST_GET_CONST(NPUPlace, p).device); + } -// const NCCLContext &at(platform::Place p) const { -// return this->at(BOOST_GET_CONST(CUDAPlace, p).device); 
-// } + const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } -// const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } + void WaitAll() { + for (auto &p : contexts_) { + p.second.ctx_->Wait(); + } + } +}; -// void WaitAll() { -// for (auto &p : contexts_) { -// p.second.ctx_->Wait(); -// } -// } -// }; +inline std::string GetFlatHCCLVarName(size_t pos) { + if (pos == 0) { + return HCCL_ID_VARNAME; + } + return string::Sprintf("%s_%d", HCCL_ID_VARNAME, static_cast(pos)); +} -// inline std::string GetFlatNCCLVarName(size_t pos) { -// if (pos == 0) { -// return NCCL_ID_VARNAME; -// } -// return string::Sprintf("%s_%d", NCCL_ID_VARNAME, static_cast(pos)); -// } - -// inline std::string GetHierarchicalExterNCCLVarName(size_t pos) { -// return string::Sprintf("Hierarchical_exter_%s_%d", NCCL_ID_VARNAME, -// static_cast(pos)); -// } -// inline std::string GetHierarchicalInterNCCLVarName(size_t pos) { -// return string::Sprintf("Hierarchical_inter_%s_%d", NCCL_ID_VARNAME, -// static_cast(pos)); -// } - -// class NCCLCommunicator { -// public: -// NCCLCommunicator() {} -// virtual ~NCCLCommunicator() PADDLE_MAY_THROW {} +inline std::string GetHierarchicalExterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_exter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} +inline std::string GetHierarchicalInterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_inter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} -// NCCLContextMap *DefaultFlatCtx() const { -// if (flat_ctxs_.size() == 0) { -// return nullptr; -// } +class HCCLCommunicator { + public: + HCCLCommunicator() {} + virtual ~HCCLCommunicator() PADDLE_MAY_THROW {} -// return flat_ctxs_[0].get(); -// } + HCCLContextMap *DefaultFlatCtx() const { + if (flat_ctxs_.size() == 0) { + return nullptr; + } -// std::vector> *GetFlatCtxs() { -// return &flat_ctxs_; -// } + return flat_ctxs_[0].get(); + } -// NCCLContextMap *GetFlatCtx(size_t run_order) const { -// 
return flat_ctxs_[run_order % flat_ctxs_.size()].get(); -// } + std::vector> *GetFlatCtxs() { + return &flat_ctxs_; + } -// NCCLContextMap *GetRunEnvNCCLCtx(size_t run_order, -// bool use_hierarchical_allreduce) const { -// if (!use_hierarchical_allreduce) { -// return GetFlatCtx(run_order); -// } + HCCLContextMap *GetFlatCtx(size_t run_order) const { + return flat_ctxs_[run_order % flat_ctxs_.size()].get(); + } -// return GetHierarchicalInterCtx(run_order); -// } + HCCLContextMap *GetRunEnvHCCLCtx(size_t run_order, + bool use_hierarchical_allreduce) const { + if (!use_hierarchical_allreduce) { + return GetFlatCtx(run_order); + } - -// *When nccl inits nccl comm using ncclCommInitAll, it meets error when -// *allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So -// *create a new nccl comm for sync_batch_norm_op. And these codes should be -// *polished with a unified nccl management. - -// NCCLContextMap *GetSyncBatchNormCtx( -// framework::Scope *scope, const std::vector &places) { -// auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); -// if (nccl_id_var != nullptr) { -// return DefaultFlatCtx(); -// } - -// if (sync_batch_norm_ctx_.get() == nullptr) { -// sync_batch_norm_ctx_.reset(new NCCLContextMap(places)); -// } -// return sync_batch_norm_ctx_.get(); -// } + return GetHierarchicalInterCtx(run_order); + } -// void InitFlatCtxs(const std::vector &places, -// const std::vector &nccl_ids, -// size_t trainers_num, size_t trainer_id) { -// if (nccl_ids.size() == 0) { -// auto ptr = new platform::NCCLContextMap(places); -// VLOG(1) << "init local trainer"; -// flat_ctxs_.emplace_back(ptr); -// } else { -// for (size_t i = 0; i < nccl_ids.size(); i++) { -// auto ptr = new platform::NCCLContextMap(places, nccl_ids[i], -// trainers_num, trainer_id); -// VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; -// flat_ctxs_.emplace_back(ptr); -// } -// } - -// // as Executor have no way to use ncclComm created by ParallelExecutor, -// 
// we assign all flatten contexts to NCCLCommContext to fix. -// int nranks = static_cast(trainers_num * places.size()); -// int nrings = static_cast(flat_ctxs_.size()); -// for (int ring_id = 0; ring_id < nrings; ++ring_id) { -// for (size_t p = 0; p < places.size(); ++p) { -// int rank = trainer_id * places.size() + p; -// int dev_id = BOOST_GET_CONST(CUDAPlace, places[p]).device; -// auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); -// NCCLCommContext::Instance().AssignNCCLComm(ctx.comm_, nranks, rank, -// dev_id, ring_id); -// } -// } -// } -// void InitHierarchicalCtxs(const std::vector &places, -// const std::vector &inter_nccl_ids, -// const std::vector &exter_nccl_ids, -// size_t trainers_num, size_t trainer_id, -// size_t inter_trainers_num, -// size_t exter_trainers_num) { -// PADDLE_ENFORCE_EQ( -// trainers_num, inter_trainers_num * exter_trainers_num, -// platform::errors::InvalidArgument( -// "trainers_num:%llu != inter_trainers_num:%llu * " -// "exter_trainers_num:%llu", -// trainers_num, inter_trainers_num, exter_trainers_num)); - -// PADDLE_ENFORCE_GT( -// inter_trainers_num, 1, -// platform::errors::InvalidArgument( -// "The inter_trainers_num:%llu should be larger than 1.", -// inter_trainers_num)); - -// int inter_trainer_id = trainer_id % inter_trainers_num; -// for (size_t i = 0; i < inter_nccl_ids.size(); i++) { -// VLOG(1) << "init inter_trainer_id:" << inter_trainer_id -// << ", comm no:" << i; -// auto local = new NCCLContextMap(places, inter_nccl_ids[i], -// inter_trainers_num, inter_trainer_id); - -// h_inter_ctxs_.emplace_back(local); -// } - -// int exter_trainer_id = -1; -// if (trainer_id % inter_trainers_num == 0) { -// exter_trainer_id = trainer_id / inter_trainers_num; -// } - -// if (exter_trainer_id >= 0) { -// for (size_t i = 0; i < exter_nccl_ids.size(); i++) { -// auto ex = new NCCLContextMap(places, exter_nccl_ids[i], -// exter_trainers_num, exter_trainer_id); -// VLOG(1) << "init exter_trainer_id:" << exter_trainer_id 
-// << ", comm no:" << i; -// h_exter_ctxs_.emplace_back(ex); -// } -// } -// } + /* + When nccl inits nccl comm using ncclCommInitAll, it meets error when + allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So + create a new nccl comm for sync_batch_norm_op. And these codes should be + polished with a unified nccl management. + */ -// bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; } + HCCLContextMap *GetSyncBatchNormCtx(framework::Scope* scope, const std::vector &places) { + auto *hccl_id_var = scope->FindVar(HCCL_ID_VARNAME); + if (hccl_id_var != nullptr) { + return DefaultFlatCtx(); + } -// NCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const { -// PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0, -// platform::errors::InvalidArgument( -// "Hierarchical ctxs should be initialized firstly!")); -// return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get(); -// } + if (sync_batch_norm_ctx_.get() == nullptr) { + sync_batch_norm_ctx_.reset(new HCCLContextMap(places)); + } + return sync_batch_norm_ctx_.get(); + } -// NCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const { -// PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0, -// platform::errors::InvalidArgument( -// "Hierarchical ctxs should be initialized firstly!")); -// return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get(); -// } + void InitFlatCtxs(const std::vector &places, + const std::vector &hccl_ids, + size_t trainers_num, size_t trainer_id) { + if (hccl_ids.size() == 0) { + auto ptr = new platform::HCCLContextMap(places); + VLOG(1) << "init local trainer"; + flat_ctxs_.emplace_back(ptr); + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto ptr = new platform::HCCLContextMap(places, hccl_ids[i], + trainers_num, trainer_id); + VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; + flat_ctxs_.emplace_back(ptr); + } + } + + // as Executor have no way to use ncclComm created by ParallelExecutor, + // we assign all flatten 
contexts to HCCLCommContext to fix. + int nranks = static_cast(trainers_num * places.size()); + int nrings = static_cast(flat_ctxs_.size()); + for (int ring_id = 0; ring_id < nrings; ++ring_id) { + for (size_t p = 0; p < places.size(); ++p) { + int rank = trainer_id * places.size() + p; + int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device; + auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); + HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank, + dev_id, ring_id); + } + } + } -// std::vector> *GetHierarchicalInterCtxs() { -// return &h_inter_ctxs_; -// } + void InitHierarchicalCtxs(const std::vector &places, + const std::vector &inter_hccl_ids, + const std::vector &exter_hccl_ids, + size_t trainers_num, size_t trainer_id, + size_t inter_trainers_num, + size_t exter_trainers_num) { + PADDLE_ENFORCE_EQ( + trainers_num, inter_trainers_num * exter_trainers_num, + platform::errors::InvalidArgument( + "trainers_num:%llu != inter_trainers_num:%llu * " + "exter_trainers_num:%llu", + trainers_num, inter_trainers_num, exter_trainers_num)); + + PADDLE_ENFORCE_GT( + inter_trainers_num, 1, + platform::errors::InvalidArgument( + "The inter_trainers_num:%llu should be larger than 1.", + inter_trainers_num)); + + int inter_trainer_id = trainer_id % inter_trainers_num; + for (size_t i = 0; i < inter_hccl_ids.size(); i++) { + VLOG(1) << "init inter_trainer_id:" << inter_trainer_id + << ", comm no:" << i; + auto local = new HCCLContextMap(places, inter_hccl_ids[i], + inter_trainers_num, inter_trainer_id); + + h_inter_ctxs_.emplace_back(local); + } + + int exter_trainer_id = -1; + if (trainer_id % inter_trainers_num == 0) { + exter_trainer_id = trainer_id / inter_trainers_num; + } + + if (exter_trainer_id >= 0) { + for (size_t i = 0; i < exter_hccl_ids.size(); i++) { + auto ex = new HCCLContextMap(places, exter_hccl_ids[i], + exter_trainers_num, exter_trainer_id); + VLOG(1) << "init exter_trainer_id:" << exter_trainer_id + << ", comm no:" << i; + 
h_exter_ctxs_.emplace_back(ex); + } + } + } -// std::vector> *GetHierarchicalExterCtxs() { -// return &h_exter_ctxs_; -// } + bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; } -// protected: -// // Support multi nccl comm on default nccl ring while NCCLContextMap can't. -// std::vector> flat_ctxs_; + HCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get(); + } -// // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce. -// // And h_exter_ctxs_ can support multi comm too. -// std::vector> h_inter_ctxs_; -// std::vector> h_exter_ctxs_; + HCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get(); + } -// // just used for sync_batch_norm op. -// std::unique_ptr sync_batch_norm_ctx_; -// }; + std::vector> *GetHierarchicalInterCtxs() { + return &h_inter_ctxs_; + } + + std::vector> *GetHierarchicalExterCtxs() { + return &h_exter_ctxs_; + } + + protected: + // Support multi nccl comm on default nccl ring while HCCLContextMap can't. + std::vector> flat_ctxs_; + + // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce. + // And h_exter_ctxs_ can support multi comm too. + std::vector> h_inter_ctxs_; + std::vector> h_exter_ctxs_; + + // just used for sync_batch_norm op. + std::unique_ptr sync_batch_norm_ctx_; +}; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/npu_profiler.h b/paddle/fluid/platform/npu_profiler.h index 05325aaf9baa1..a7b674d0d0c3f 100644 --- a/paddle/fluid/platform/npu_profiler.h +++ b/paddle/fluid/platform/npu_profiler.h @@ -23,7 +23,17 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -// For ACL 20.1 +#ifdef PADDLE_WITH_ASCEND_STRING +// For CANN 20.2+ +// ACL_AICORE_ARITHMETIC_UTILIZATION = 0, record arithmetic stats +// ACL_AICORE_PIPE_UTILIZATION = 1, record pipeline +// ACL_AICORE_MEMORY_BANDWIDTH = 2, record memory +// ACL_AICORE_L0B_AND_WIDTH = 3, recore internal memory +// ACL_AICORE_RESOURCE_CONFLICT_RATIO = 5, record pipeline ratio +constexpr aclprofAicoreMetrics default_metrics = + ACL_AICORE_ARITHMETIC_UTILIZATION; +#else +// For CANN 20.1 // ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats // ACL_AICORE_PIPELINE = 1, record pipeline // ACL_AICORE_SYNCHRONIZATION = 2, record sync @@ -32,6 +42,7 @@ namespace platform { // ACL_AICORE_STALL = 5, record pipeline ratio constexpr aclprofAicoreMetrics default_metrics = ACL_AICORE_ARITHMATIC_THROUGHPUT; +#endif // ACL_PROF_ACL_API, record ACL API stats // ACL_PROF_TASK_TIME, record AI core stats diff --git a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py index 4cda0ceeccf9c..ac80ea4c62cbf 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py @@ -14,6 +14,8 @@ import unittest import numpy as np +import sys +sys.path.append("..") from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 400ddd9d4aab0..2463ddb7137ac 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -41,7 +41,7 @@ def setUp(self): vocab = 10 dim = 20 w = np.ones([vocab, dim]).astype(self.dtype) - x = np.random.randint(0, vocab, 
size=(bsz, seqlen)).astype(np.int64) + x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32) out = np.ones([bsz, seqlen, dim]).astype(self.dtype) self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index e65a3dac73928..4fcfd33b32f4e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -248,8 +248,9 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + self.assertTrue(np.allclose( + npu_pred, cpu_pred, atol=1e-5)) # atol needed on cann 20.3 + self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5)) @unittest.skipIf(not paddle.is_compiled_with_npu(), diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py index 087256b298088..583a648224d73 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py @@ -16,6 +16,8 @@ import unittest import numpy as np +import sys +sys.path.append("..") from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_assign_op_npu.py b/python/paddle/fluid/tests/unittests/test_assign_op_npu.py index 44515ce2e5b94..ed21549b7e01f 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op_npu.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op_npu.py @@ -36,7 +36,7 @@ def setUp(self): self.op_type = "assign" self.init_dtype() - x = np.rand.random([3,3]) + x = np.random.random([3, 3]).astype(self.dtype) self.inputs = {'X': x} self.attrs = {} @@ -46,7 +46,7 @@ def set_npu(self): self.__class__.use_npu = True def 
init_dtype(self): - self.dtype = np.int64 + self.dtype = np.float32 def test_check_output(self): self.check_output_with_place(self.place, check_dygraph=False) @@ -54,4 +54,3 @@ def test_check_output(self): if __name__ == '__main__': unittest.main() -