Commit 3701d55

author Leo Chen <[email protected]> 1617782111 +0800
committer liuwei88 <[email protected]> 1617789509 +0800

[NPU] support CANN 20.3 (PaddlePaddle#32044)

* fix compile problem on CANN 20.3
* fix ut
* fix test_mul
* fix check_finite_and_scale
* fix lookup_table_v2_grad
* fix cmake
* support print op
* add reduce op
* add more tests
* add padding set
* add reduce sum op and try to pr
* add reduce sum op and try to pr 2
* test all ops: successful in 28 but failed in 24 (broadcast and reducescatter)
* add send/recv op
* change some file types
zhiqiu authored and lw921014 committed Apr 7, 2021
1 parent 78959a3 commit 3701d55
Showing 44 changed files with 1,747 additions and 833 deletions.
9 changes: 6 additions & 3 deletions cmake/external/ascend.cmake
@@ -21,6 +21,11 @@ else()
set(ASCEND_DIR /usr/local/Ascend)
endif()

if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h)
# The presence of ascend_string.h means CANN 20.2 or later
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()

if(WITH_ASCEND)
set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
@@ -43,9 +48,7 @@ if(WITH_ASCEND)
set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})

if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
add_definitions(-DPADDLE_WITH_ASCEND_STRING)
endif()


ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
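The net effect of the CMake change above is that PADDLE_WITH_ASCEND_STRING is now defined whenever the installed toolkit ships graph/ascend_string.h (CANN 20.2 or later), instead of only inside the WITH_ASCEND branch. As a self-contained illustration of how such a CMake-defined macro is consumed (placeholder code, not taken from Paddle):

    #include <iostream>

    // Minimal sketch: behaviour switches at compile time on the macro that
    // ascend.cmake defines via add_definitions(-DPADDLE_WITH_ASCEND_STRING).
    int main() {
    #ifdef PADDLE_WITH_ASCEND_STRING
      std::cout << "built against a CANN 20.2+ toolkit (ascend_string.h found)\n";
    #else
      std::cout << "built against an older CANN toolkit\n";
    #endif
      return 0;
    }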
1 change: 1 addition & 0 deletions paddle/fluid/framework/operator.cc
@@ -400,6 +400,7 @@ OperatorBase::OperatorBase(const std::string& type,
// framework::OpRegistry::CreateOp(type, {}, {}, {}, false).
// Inputs, outputs and attrs will be set to empty map
// to improve the execution efficiency of dygraph.

if (inputs_.size() > 0 || outputs_.size() > 0) {
GenerateTemporaryNames();
CheckAllInputOutputSet();
12 changes: 12 additions & 0 deletions paddle/fluid/framework/var_type_traits.h
@@ -31,6 +31,11 @@
#endif
#endif

#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl.h>
#include <hccl/hccl_types.h>
#endif

#if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h"
#endif
@@ -45,6 +50,10 @@ class Communicator;
class NCCLCommunicator;
#endif
#endif
#ifdef PADDLE_WITH_ASCEND_CL
class Communicator;
class HCCLCommunicator;
#endif

#if defined(PADDLE_WITH_XPU_BKCL)
class BKCLCommunicator;
@@ -157,6 +166,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
#endif
operators::CudnnRNNCache,
#endif
#if defined(PADDLE_WITH_ASCEND_CL)
HcclRootInfo,
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
BKCLUniqueId, platform::BKCLCommunicator,
#endif
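Registering HcclRootInfo with the variable-type registry is what allows the new HCCL-id ops to pass the root info through framework variables. A minimal sketch of that usage, assuming the Paddle framework headers and an Ascend (PADDLE_WITH_ASCEND_CL) build; the function and variable names here are illustrative only:

    #include "paddle/fluid/framework/scope.h"
    #include "paddle/fluid/framework/var_type_traits.h"

    // Because HcclRootInfo is now a registered variable type, a scope variable
    // can hold one directly; the c_gen_hccl_id op in the tests below writes
    // into exactly such a variable.
    void StoreHcclRootInfo(paddle::framework::Scope* scope) {
      auto* var = scope->Var("HcclId");  // hypothetical variable name
      HcclRootInfo* id = var->GetMutable<HcclRootInfo>();
      (void)id;  // filled in later, e.g. by running the c_gen_hccl_id op
    }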
1 change: 0 additions & 1 deletion paddle/fluid/operators/CMakeLists.txt
@@ -159,7 +159,6 @@ endif()

if (WITH_ASCEND_CL)
cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op)
cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op scope device_context enforce executor compare_op)
endif()

@@ -61,7 +61,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {

size_t x_size = xs.size();
for (size_t i = 0; i < x_size; ++i) {
found_inf_data = true;
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
@@ -77,6 +76,8 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
NpuOpRunner("CheckNumerics", {*x}, {check_xout},
{{"message", std::string("check_nan_and_inf")}});
runner_checknumerics.Run(stream);
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.Wait();
} catch (platform::EnforceNotMet& exception) {
LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
found_inf_data = true;
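The added Wait() call is presumably there so that a failure raised by the asynchronous CheckNumerics kernel surfaces while execution is still inside the try block. Condensed from the hunk above (declarations of x, check_xout, stream, ctx and found_inf_data omitted):

    try {
      const auto& runner_checknumerics =
          NpuOpRunner("CheckNumerics", {*x}, {check_xout},
                      {{"message", std::string("check_nan_and_inf")}});
      runner_checknumerics.Run(stream);
      // Block on the NPU stream: without this, the kernel could fail after
      // the try block has exited and the NaN/Inf case would go undetected.
      ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
    } catch (platform::EnforceNotMet& exception) {
      LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!";
      found_inf_data = true;
    }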
@@ -110,10 +110,10 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) {
// out found_inf
Tensor found_inf_tensor;
found_inf_tensor.Resize({1});
bool *is_finite_data =
bool *found_inf_data =
found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
f::TensorCopy(*found_inf, place, &found_inf_tensor);
EXPECT_FALSE(*is_finite_data);
EXPECT_TRUE(*found_inf_data);

ctx.Wait();
}
26 changes: 15 additions & 11 deletions paddle/fluid/operators/collective/CMakeLists.txt
@@ -11,7 +11,7 @@ foreach(src ${OPS})
set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
endforeach()

register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})

if(WITH_NCCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
@@ -24,39 +24,43 @@ if(WITH_GLOO)
endif()

if(WITH_XPU_BKCL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper nccl_common)
op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
endif()

if(WITH_ASCEND_CL)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope)
set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper)
op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
endif()

set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")

if(WITH_ASCEND_CL)
set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hcom_op op_registry ascend_hccl flags
set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper
gen_hccl_id_op op_registry ascend_hccl flags
dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc
DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc
DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc
DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc
DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc
DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hcom_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
DEPS op_registry elementwise_add_op c_sync_calc_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
endif()
14 changes: 6 additions & 8 deletions paddle/fluid/operators/collective/c_allgather_op_npu.cc
@@ -31,20 +31,19 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
#if defined(PADDLE_WITH_ASCEND_CL)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
HcclDataType dtype = platform::ToHCCLDataType(in->type());

int ring_id = ctx.Attr<int>("ring_id");
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks();
std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

framework::DDim out_dims = in->dims();
out_dims[0] *= nranks;
out->mutable_data<T>(out_dims, place);

int64_t send_numel = in->numel();
uint64_t send_numel = in->numel();
void *send_buff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void *recv_buff = reinterpret_cast<void*>(out->data<T>());

@@ -59,12 +58,11 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
VLOG(3) << "begin hccl allgather, parameter is: "
<< ", group is " << group
<< ", ring_id is " << ring_id
<< ", nranks is " << nranks
<< ", tag is " << tag;
<< ", nranks is " << nranks;

PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather(
tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype,
group.c_str(), (void*)stream));
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
send_buff, recv_buff, send_numel, dtype,
comm->comm(), (void*)stream));

#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
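The substantive change in this kernel is the move from the older hcom_* entry points, which identify a collective by a string tag and group name, to the Hccl* entry points used by newer CANN releases, which operate on the communicator handle directly. Side by side, with both calls taken from this diff and the surrounding declarations omitted:

    // Removed: tag/group-based call.
    // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather(
    //     tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype,
    //     group.c_str(), (void*)stream));

    // Added: communicator-handle-based call.
    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
        send_buff, recv_buff, send_numel, dtype,
        comm->comm(), (void*)stream));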
82 changes: 63 additions & 19 deletions paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -34,6 +34,7 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
@@ -45,7 +46,8 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math;

USE_OP(c_allgather);
USE_NO_KERNEL_OP(c_comm_init_hcom);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_allgather, NPU);

DECLARE_string(selected_npus);
@@ -56,41 +58,81 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl <<debugstring;
VLOG(2) << preStr << ":" << std::endl <<debugstring;
}

void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){

int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));

VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
<< "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;


std::vector<std::string > endpointList={"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;

auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();

VLOG(3) << "break";

auto comm_init_op =
f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();

memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){

auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();

memcpy(id, hccl_id, 1024);

int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));

VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["nranks"] = 2;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
comm_init_attrs["rank_ids"] = rank_ids;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}

void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("X");
auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>();

std::vector<float> init;
int rank_id = atoi(getenv("RANK_ID"));

int num1 = 1;
int num2 = 4;

@@ -104,7 +146,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
ctx.Wait();

auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate
@@ -115,15 +157,15 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
attrs["tag"]=std::string("tagx");
attrs["ring_id"]=0;
attrs["nranks"]=2;
auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);

auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}},
{{"Out", {"OutData"}}}, attrs);

for (int i = 0; i < 10; i ++) {
op->Run(*scope, place);
}
ctx.Wait();

std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
@@ -142,10 +184,12 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {

TEST(c_allgather, NPU) {
f::Scope scope;
HcclRootInfo hccl_id;

// only support one device, if more than one device, use first default
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

Prepare(&scope, ctx);
PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHCCLAllGatherOp(&scope, ctx);
}
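One fragility worth noting in these tests: RANK_ID and DEVICE_ID are read with atoi(getenv(...)), which dereferences a null pointer when the variable is unset. A small, self-contained guard (not part of this diff; names are illustrative) would be:

    #include <cstdlib>
    #include <iostream>

    // Returns the integer value of an environment variable, or a default
    // when the variable is not set, avoiding atoi(nullptr).
    static int EnvToInt(const char* name, int default_value) {
      const char* value = std::getenv(name);
      return value != nullptr ? std::atoi(value) : default_value;
    }

    int main() {
      int rank_id = EnvToInt("RANK_ID", 0);
      int device_id = EnvToInt("DEVICE_ID", 0);
      std::cout << "rank_id=" << rank_id << " device_id=" << device_id << "\n";
      return 0;
    }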