diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index f46c5bf7ac09d..ed98ca60e4d83 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -21,6 +21,11 @@ else() set(ASCEND_DIR /usr/local/Ascend) endif() +if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) + # It means CANN 20.2 + + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + if(WITH_ASCEND) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) @@ -43,9 +48,7 @@ if(WITH_ASCEND) set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) - add_definitions(-DPADDLE_WITH_ASCEND_STRING) - endif() + ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 665e7b2fcf8bc..c5fe08ab78224 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -400,6 +400,7 @@ OperatorBase::OperatorBase(const std::string& type, // framework::OpRegistry::CreateOp(type, {}, {}, {}, false). // Inputs, outputs and attrs will be set to empty map // to improve the execution efficiency of dygraph. 
+ if (inputs_.size() > 0 || outputs_.size() > 0) { GenerateTemporaryNames(); CheckAllInputOutputSet(); diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 2fd4de5cfcba4..e2c4bdd978e8f 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -31,6 +31,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -45,6 +50,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -157,6 +166,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6fe18f2479478..2d3550f8f0636 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -159,7 +159,6 @@ endif() if (WITH_ASCEND_CL) cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) - cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op) cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op scope device_context enforce executor compare_op) endif() diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 46f9f7ff08944..3db458050255e 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -61,7 +61,6 @@ class 
CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { size_t x_size = xs.size(); for (size_t i = 0; i < x_size; ++i) { - found_inf_data = true; const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); @@ -77,6 +76,8 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { NpuOpRunner("CheckNumerics", {*x}, {check_xout}, {{"message", std::string("check_nan_and_inf")}}); runner_checknumerics.Run(stream); + ctx.template device_context() + .Wait(); } catch (platform::EnforceNotMet& exception) { LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; found_inf_data = true; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index 99e81a4757d0e..1ed188b15939e 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -110,10 +110,10 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) { // out found_inf Tensor found_inf_tensor; found_inf_tensor.Resize({1}); - bool *is_finite_data = + bool *found_inf_data = found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); f::TensorCopy(*found_inf, place, &found_inf_tensor); - EXPECT_FALSE(*is_finite_data); + EXPECT_TRUE(*found_inf_data); ctx.Wait(); } diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 93caa5354dbc4..1a8ac46b637f4 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op 
c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -24,39 +24,43 @@ if(WITH_GLOO) endif() if(WITH_XPU_BKCL) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper nccl_common) op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() if(WITH_ASCEND_CL) -set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") if(WITH_ASCEND_CL) - set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hcom_op op_registry ascend_hccl flags + set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags dynamic_loader dynload_warpctc scope device_context enforce executor) cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc - DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc - DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc DEPS c_reducescatter_op ${COLLECTIVE_DEPS} 
${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) - cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc - DEPS op_registry c_broadcast_op c_comm_init_hcom_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) - cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc DEPS op_registry elementwise_add_op c_sync_calc_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) endif() diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc index ea6caf954d14c..f142367e9f1be 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -31,20 +31,19 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel { #if defined(PADDLE_WITH_ASCEND_CL) auto in = ctx.Input("X"); auto out = 
ctx.Output("Out"); - hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); int ring_id = ctx.Attr("ring_id"); std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); auto place = ctx.GetPlace(); auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); int nranks = comm->nranks(); - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); framework::DDim out_dims = in->dims(); out_dims[0] *= nranks; out->mutable_data(out_dims, place); - int64_t send_numel = in->numel(); + uint64_t send_numel = in->numel(); void *send_buff = reinterpret_cast(const_cast(in->data())); void *recv_buff = reinterpret_cast(out->data()); @@ -59,12 +58,11 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel { VLOG(3) << "begin hccl allgather, parameter is: " << ", group is " << group << ", ring_id is " << ring_id - << ", nranks is " << nranks - << ", tag is " << tag; + << ", nranks is " << nranks; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather( - tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype, - group.c_str(), (void*)stream)); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather( + send_buff, recv_buff, send_numel, dtype, + comm->comm(), (void*)stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index 38f19170af958..922bfe868ebbb 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -34,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -45,7 +46,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_allgather); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_allgather, NPU); DECLARE_string(selected_npus); @@ -56,28 +58,68 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -85,12 +127,12 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); std::vector init; int rank_id = atoi(getenv("RANK_ID")); - + int num1 = 1; int num2 = 4; @@ -104,7 +146,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); 
tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -115,15 +157,15 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { attrs["tag"]=std::string("tagx"); attrs["ring_id"]=0; attrs["nranks"]=2; - - auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); for (int i = 0; i < 10; i ++) { op->Run(*scope, place); } ctx.Wait(); - + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -142,10 +184,12 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { TEST(c_allgather, NPU) { f::Scope scope; + HcclRootInfo hccl_id; - // only support one device, if more than one device, use first default + // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHCCLAllGatherOp(&scope, ctx); } diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 3fdc859506754..14f1cb4dcbda5 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -34,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -45,7 +46,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_allreduce_max); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); DECLARE_string(selected_npus); @@ -59,7 +61,7 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -85,7 +127,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); std::vector init; @@ -104,7 +146,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -115,8 
+157,8 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { attrs["tag"]=std::string("tagx"); attrs["ring_id"]=0; - auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); for (int i = 0; i < 10; i ++) { op->Run(*scope, place); @@ -137,10 +179,12 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { TEST(c_allreduce_max, NPU) { f::Scope scope; + HcclRootInfo hccl_id; // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHCCLAllReduceOp(&scope, ctx); } diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 0011d9e9ad83a..a770096e43110 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -117,34 +117,18 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - - // we need to pre-allocate 512 Bytes before the data - // and 512 Bytes after the data, so the hccl allreduce - // can work. This is a must acooding to huawei peer. 
- #define PRE_MALLOC_SIZE_BYTES 512 - auto in = ctx.Input("X"); auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); - hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); int64_t numel = in->numel(); - int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T); - int64_t tmp_numel = numel + pre_tmp_size * 2; - - paddle::framework::LoDTensor tmp_in, tmp_out; - tmp_in.Resize({tmp_numel}); - tmp_out.Resize({tmp_numel}); - auto p_tmp_in = tmp_in.mutable_data(place); // allocate - auto p_tmp_out = tmp_out.mutable_data(place); // allocate - - void* sendbuff = reinterpret_cast(tmp_in.data() + pre_tmp_size); - void* recvbuff = reinterpret_cast(tmp_out.data() + pre_tmp_size); + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); int ring_id = ctx.Attr("ring_id"); std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); aclrtStream stream = nullptr; auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); @@ -154,33 +138,22 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { stream = comm->stream(); } - // we need to memset this memory firstly to avoid core by hccl - platform::NPUMemsetAsync(static_cast(p_tmp_in), 0, tmp_numel*sizeof(T), stream); - platform::NPUMemsetAsync(static_cast(p_tmp_out), 0, tmp_numel*sizeof(T), stream); - - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); - - memory::Copy(npu_place, sendbuff, - npu_place, reinterpret_cast(const_cast(in->data())), - numel * sizeof(T), - stream); - - hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM; + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; switch (red_type) { case kRedSum: - hccl_red_type = HCCL_REP_OP_SUM; + hccl_red_type = HCCL_REDUCE_SUM; break; case kRedMax: - 
hccl_red_type = HCCL_REP_OP_MAX; + hccl_red_type = HCCL_REDUCE_MAX; break; case kRedMin: - hccl_red_type = HCCL_REP_OP_MIN; + hccl_red_type = HCCL_REDUCE_MIN; break; case kRedProd: - hccl_red_type = HCCL_REP_OP_PROD; + hccl_red_type = HCCL_REDUCE_PROD; break; default: @@ -192,16 +165,10 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { << "input num: " << numel << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type - << ", group is: " << group - << ", tag is " << tag; - - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce( - tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type, group.c_str(), (void*)stream)); + << ", group is: " << group; - memory::Copy(npu_place, reinterpret_cast(out->data()), - npu_place, recvbuff, - numel * sizeof(T), - stream); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), (void*)stream)); out->Resize(in->dims()); #else diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 2fff84593c008..600d85743283b 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -42,7 +43,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_allreduce_sum); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); DECLARE_string(selected_npus); @@ -56,7 +58,7 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ VLOG(3) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; 
comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -82,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); int rank_id = atoi(getenv("RANK_ID")); @@ -101,7 +143,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) tensor_x->Resize({num1, num2}); ctx.Wait(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -113,8 +155,8 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) attrs["ring_id"]=0; auto op = f::OpRegistry::CreateOp("c_allreduce_sum", - {{"X", {"X"}}}, - {{"Out", {"Out"}}}, + {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); for (int i = 0; i < 10; i ++) { @@ -136,11 +178,13 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) TEST(c_allreduce_sum, NPU) { f::Scope scope; + HcclRootInfo hccl_id; // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); for(int i = 0; i < 1; i ++){ VLOG(2) << "iter num: " << i; TestHCCLAllReduceOp(&scope, ctx, i); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc index 67410a1c70426..9dd9a31610985 100644 --- 
a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -30,7 +30,7 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel { auto x = ctx.Input("X"); void *ptr = reinterpret_cast(const_cast(x->data())); int numel = x->numel(); - hcclDataType_t dtype = platform::ToHCCLDataType(x->type()); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); auto out = ctx.Output("Out"); @@ -48,14 +48,12 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel { int root = ctx.Attr("root"); std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); VLOG(3) << "begin hccl broadcast, parameter is: "<< "root " << root - << ", group is " << group - << ", tag is " << tag; + << ", group is " << group << ", comm: " << comm->comm() << ", stream: " << stream; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(tag.c_str(), ptr, numel, - dtype, (uint32_t)root, group.c_str(), (void*)stream)); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel, + dtype, (uint32_t)root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " << framework::product(out->dims()); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 66158e5ff28ae..6e82e6c0bfde7 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -42,7 +43,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_broadcast); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_broadcast, NPU); DECLARE_string(selected_npus); @@ -53,28 +55,68 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -82,12 +124,12 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); int num = 2; std::vector init; int rank_id = atoi(getenv("RANK_ID")); - + for (int64_t i = 0; i < num * num; ++i) { init.push_back(1.0 + rank_id); } @@ -98,7 +140,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = 
scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num, num}); tensor_out->mutable_data(place); // allocate @@ -110,14 +152,14 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { attrs["root"]=0; attrs["ring_id"]=0; - auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); for (int i = 0; i < 10; i ++) { op->Run(*scope, place); } ctx.Wait(); - + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -131,10 +173,11 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { TEST(c_broadcast, NPU) { f::Scope scope; - - // only support one device, if more than one device, use first default + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHCCLBroadcastOp(&scope, ctx); } diff --git a/paddle/fluid/operators/collective/c_comm_init_hcom_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc similarity index 62% rename from paddle/fluid/operators/collective/c_comm_init_hcom_op.cc rename to paddle/fluid/operators/collective/c_comm_init_hccl_op.cc index f720ffdd0fe8c..1677ac5bcb01d 100644 --- a/paddle/fluid/operators/collective/c_comm_init_hcom_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,66 +12,67 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_ASCEND_CL) -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/hccl_helper.h" - #include -#include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace framework { class Scope; } // namespace framework } // namespace paddle +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif namespace paddle { namespace operators { -class CCommInitOpNPU : public framework::OperatorBase { +class CCommInitOpAscend : public framework::OperatorBase { public: - CCommInitOpNPU(const std::string& type, - const framework::VariableNameMap& inputs, + CCommInitOpAscend(const std::string& type, const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, const framework::AttributeMap& attrs) : OperatorBase(type, inputs, outputs, attrs) {} void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - int rid = Attr("ring_id"); - int nranks = Attr("nranks"); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "CCommInitOpAscend can run on npu place only.")); + + auto var = scope.FindVar(Input("X")); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input can not be empty.")); +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo* hccl_id = var->GetMutable(); + + int rank_ids = Attr("rank_ids"); int rank_id = Attr("rank"); + int rid = Attr("ring_id"); int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device; if (Attr("device_id") >= 0) { device_id = Attr("device_id"); } - std::vector rank_ids = Attr>("rank_ids"); - - VLOG(3) << "begin c_comm_init on npu, parameters are: " - << "ring id[" << rid - << "], nranks[" << nranks - << "], rank_id[" << rank_id - << "], device_id[" << device_id - << "]"; - 
platform::HCCLCommContext::Instance().CreateHCCLComm( - rank_ids, rank_id, device_id, rid); + hccl_id, rank_ids, rank_id, device_id, rid); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif } }; -class CCommInitOpNPUMaker : public framework::OpProtoAndCheckerMaker { +class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { + AddInput("X", "Raw variable contains a HCCL UniqueId instance."); AddComment(R"DOC( -CCommInit operator on NPU +CCommInit operator -Initialize collective communication context within this trainer +Initialize collective communication context within this trainer )DOC"); - AddAttr("nranks", "(int) The number of ranks of distributed trainers"); - AddAttr>("rank_ids", "The world rank ids of the group"); + AddAttr("rank_ids", "(int) The number of ranks of distributed trainers"); AddAttr("rank", "(int) The rank of the trainer in distributed training."); AddAttr("device_id", @@ -89,6 +90,4 @@ Initialize collective communication context within this trainer namespace ops = paddle::operators; -REGISTER_OPERATOR(c_comm_init_hcom, ops::CCommInitOpNPU, ops::CCommInitOpNPUMaker); - -#endif +REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend, ops::CCommInitOpAscendMaker); diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 0000000000000..78f1076f7eac4 --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#endif + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) { + } + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int rank = Attr("rank"); + framework::Scope& local_scope = scope.NewScope(); + + std::function func = [&](size_t i) -> std::string { + return Output("Out"); + }; + + if (rank == 0) { + std::vector endpoint_list = + Attr>("other_endpoints"); + SendBroadCastHCCLID(endpoint_list, 1, func, local_scope); + } else { + std::string endpoint = Attr("endpoint"); + RecvBroadCastHCCLID(endpoint, 1, func, local_scope); + } + scope.DeleteScope(&local_scope); + } +}; + +#else + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const 
framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + } +}; + +#endif + +class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + VLOG(3) << "ele"; + AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +CGenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); + AddAttr>( + "other_endpoints", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of other trainer endpoints") + .SetDefault({}); + AddAttr("rank", + "(int default 0) " + "The rank of the trainer in distributed training.") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 9446c38dcba32..915bf8391992d 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -63,6 +63,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { }; #else + class CGenNCCLIdOp : public framework::OperatorBase { public: CGenNCCLIdOp(const std::string& type, diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 4e9d804fc36ea..a8df0512dc59a 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -121,31 +121,15 @@ class CReduceOpASCENDKernel 
: public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - - // we need to pre-allocate 512 Bytes before the data - // and 512 Bytes after the data, so the hccl allreduce - // can work. This is a must acooding to huawei peer. - #define PRE_MALLOC_SIZE_BYTES 512 - auto in = ctx.Input("X"); auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); - hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); int64_t numel = in->numel(); - int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T); - int64_t tmp_numel = numel + pre_tmp_size * 2; - - paddle::framework::LoDTensor tmp_in, tmp_out; - tmp_in.Resize({tmp_numel}); - tmp_out.Resize({tmp_numel}); - auto p_tmp_in = tmp_in.mutable_data(place); // allocate - auto p_tmp_out = tmp_out.mutable_data(place); // allocate + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); - void* sendbuff = reinterpret_cast(tmp_in.data() + pre_tmp_size); - void* recvbuff = reinterpret_cast(tmp_out.data() + pre_tmp_size); - - std::string tag = ctx.Attr("tag"); int ring_id = ctx.Attr("ring_id"); int root_id = ctx.Attr("root_id"); std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); @@ -161,33 +145,22 @@ class CReduceOpASCENDKernel : public framework::OpKernel { int rank_id = comm->rank(); - // we need to memset this memory firstly to avoid core by hccl - platform::NPUMemsetAsync(static_cast(p_tmp_in), 0, tmp_numel*sizeof(T), stream); - platform::NPUMemsetAsync(static_cast(p_tmp_out), 0, tmp_numel*sizeof(T), stream); - - auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); - - memory::Copy(npu_place, sendbuff, - npu_place, reinterpret_cast(const_cast(in->data())), - numel * sizeof(T), - stream); - - hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM; + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; switch 
(red_type) { case kRedSum: - hccl_red_type = HCCL_REP_OP_SUM; + hccl_red_type = HCCL_REDUCE_SUM; break; case kRedMax: - hccl_red_type = HCCL_REP_OP_MAX; + hccl_red_type = HCCL_REDUCE_MAX; break; case kRedMin: - hccl_red_type = HCCL_REP_OP_MIN; + hccl_red_type = HCCL_REDUCE_MIN; break; case kRedProd: - hccl_red_type = HCCL_REP_OP_PROD; + hccl_red_type = HCCL_REDUCE_PROD; break; default: @@ -200,18 +173,14 @@ class CReduceOpASCENDKernel : public framework::OpKernel { << "root_id: " << root_id << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type - << ", group is: " << group - << ", tag is " << tag; + << ", group is: " << group; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce( - tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type, group.c_str(), (void*)stream)); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), (void*)stream)); - if(rank_id == root_id){ - memory::Copy(npu_place, reinterpret_cast(out->data()), - npu_place, recvbuff, - numel * sizeof(T), - stream); - }else{ + + if(rank_id != root_id){ + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); memory::Copy(npu_place, reinterpret_cast(out->data()), npu_place, reinterpret_cast(const_cast(in->data())), numel * sizeof(T), diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index 36ec6d155a214..6f99ec5b49896 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -42,7 +43,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_reduce_sum); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); DECLARE_string(selected_npus); @@ -56,7 +58,7 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ VLOG(3) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; 
comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); @@ -82,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); int rank_id = atoi(getenv("RANK_ID")); @@ -101,7 +143,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { tensor_x->Resize({num1, num2}); ctx.Wait(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -115,8 +157,8 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { attrs["root_id"]=root_id; auto op = f::OpRegistry::CreateOp("c_reduce_sum", - {{"X", {"X"}}}, - {{"Out", {"Out"}}}, + {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); op->Run(*scope, place); @@ -141,11 +183,13 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { TEST(c_reduce_sum, NPU) { f::Scope scope; + HcclRootInfo hccl_id; // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); for(int i = 0; i < 2; i ++){ VLOG(2) << "iter num: " << i; TestHCCLReduceOp(&scope, ctx, i); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc index e38bcdea27da7..22769ae3d2e96 100644 --- 
a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -35,7 +35,6 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel { auto place = ctx.GetPlace(); auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); int nranks = comm->nranks(); - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); auto out_dims = in->dims(); PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, @@ -47,11 +46,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel { out_dims[0] = out_dims[0] / nranks; out->mutable_data(out_dims, place); - int64_t recv_numel = in->numel() / nranks; + uint64_t recv_numel = in->numel() / nranks; void* inputPtr = reinterpret_cast(const_cast(in->data())); void* outputPtr = reinterpret_cast(out->data()); - hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); aclrtStream stream = nullptr; if (ctx.Attr("use_calc_stream")) { @@ -63,12 +62,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel { VLOG(3) << "begin hccl reduce scatter, parameter is: " << "recv_numel: " << recv_numel << "dtype: " << dtype - << "hccl_red_type: " << HCCL_REP_OP_SUM - << ", group is: " << group - << ", tag is " << tag; + << "hccl_red_type: " << HCCL_REDUCE_SUM + << ", group is: " << group; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_reduce_scatter( - tag.c_str(), inputPtr, outputPtr, (u64)recv_numel, dtype, HCCL_REP_OP_SUM, group.c_str(), (void*)stream)); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), (void*)stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU.")); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index 
1c21ab19b954c..9eb5813174345 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -45,7 +46,8 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_reducescatter); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); DECLARE_string(selected_npus); @@ -59,7 +61,7 @@ void PrintDebugInfo(const std::string preStr, const std::vector &data){ VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank_ids"] = 2; comm_init_attrs["rank"] = rank_id; comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; + // comm_init_attrs["rank_ids"] = rank_ids; auto comm_init_op = - f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); auto place = ctx.GetPlace(); comm_init_op->Run(*scope, place); ctx.Wait(); } + void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto x = scope->Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); std::vector init; @@ -103,7 +146,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); tensor_out->mutable_data(place); // allocate @@ -116,14 +159,14 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const 
p::DeviceContext& ctx) { attrs["ring_id"]=0; attrs["nranks"]=2; - auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); int iter_num = 10; for (int i = 0; i < iter_num; i ++) { op->Run(*scope, place); + ctx.Wait(); } - ctx.Wait(); std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); @@ -132,16 +175,18 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { PrintDebugInfo("output data", out_vec); EXPECT_EQ(out_vec.size(), init.size() / 2); for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], iter_num + 1); + EXPECT_EQ(out_vec[i], 2.0); } } TEST(c_reducescatter, NPU) { f::Scope scope; + HcclRootInfo hccl_id; // only support one device, if more than one device, use first default p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHCCLReduceScatterOp(&scope, ctx); } diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index 371bcc4cfcfef..44e837c66ad61 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -41,7 +41,7 @@ namespace m = paddle::operators::math; USE_OP(c_broadcast); USE_NO_KERNEL_OP(c_sync_comm_stream); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_broadcast, NPU); void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 0000000000000..fe247df81fcd4 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,217 
@@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. 
Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. 
init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. 
hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + } +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] 
" + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 0000000000000..c565ea4ae027f --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,349 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. +#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client 
+ return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + 
CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + 
VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 
0000000000000..1ad6f791e1fc3 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listens on endpoint, then receives the hccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// receive the hccl id from an already-listening socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index 80dad82386d1f..0db2bd3298302 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -27,32 +27,39 @@ class CRecvOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto 
out = ctx.Output("Out"); - int numel = out->numel(); - hcclDataType_t dtype = platform::ToHCCLDataType(out->type()); + auto x = ctx.Output("Out"); + void *ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); int ring_id = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); - auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); if (ctx.Attr("use_calc_stream")) { - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); - std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); - int srcRank = ctx.Attr("peer"); - int srTag = ctx.Attr("srTag"); - VLOG(3) << "recv_v2_npu attr get"; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_receive( - tag.c_str(), reinterpret_cast(const_cast(out->data())), (u64)numel, dtype, srcRank, - srTag, group.c_str(), stream)); - VLOG(3) << "Source Rank: " << srcRank << " Invoke hcom receive. 
receiving "; - out->Resize(out->dims()); - out->set_lod(out->lod()); + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", + nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: "<< "root " << root + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel, + dtype, (uint32_t)root, comm->comm(), stream)); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU.")); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 727d8be5a8f9a..22b90920e2891 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -31,6 +31,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/recv_v2_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -42,30 +44,72 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(recv_v2); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(recv_v2, NPU); -void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ - std::string rank_table_file = getenv("RANK_TABLE_FILE"); - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - int src_rank = atoi(getenv("SRC_RANK")); - int dest_rank = atoi(getenv("DEST_RANK")); - VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0,1}; - f::AttributeMap 
comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); - VLOG(3) << "CreateOp c_comm_init_hcom"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // 
comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); } void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ @@ -79,7 +123,7 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ ctx.Wait(); auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); + auto out = scope->Var("Data"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num, num}); tensor_out->mutable_data(place); // allocate @@ -96,7 +140,7 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ out_shape.push_back(num); attrs["out_shape"]=out_shape; - auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); VLOG(3) << "CreateOp recv_v2"; for (int i = 0; i < 10; i ++) { @@ -113,12 +157,13 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ TEST(recv_v2, NPU){ f::Scope scope; + HcclRootInfo hccl_id; + char * npu_id=getenv("FLAGS_selected_npus"); VLOG(3) << "Select npu:" << npu_id; p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - VLOG(3) << "Place over"; - Prepare(&scope, ctx); - VLOG(3) << "Prepare over"; + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); TestHcomRecvOp(&scope, ctx); - VLOG(3) << "Test over"; } diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc index 15fcec269c569..621c9ffe4cd17 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -28,31 +28,37 @@ class CSendOpASCENDKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) auto x = ctx.Input("X"); + void *ptr = 
reinterpret_cast(const_cast(x->data())); int numel = x->numel(); - hcclDataType_t dtype = platform::ToHCCLDataType(x->type()); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); - auto place = ctx.GetPlace(); int ring_id = ctx.Attr("ring_id"); - auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + auto place = ctx.GetPlace(); + auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); if (ctx.Attr("use_calc_stream")) { - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } - std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId()); - std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); - int destRank = ctx.Attr("peer"); - int srTag = ctx.Attr("srTag"); - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_send( - tag.c_str(), reinterpret_cast(const_cast(x->data())), (u64)numel, dtype, destRank, - srTag, group.c_str(), stream)); + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, + platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", + nranks)); + + int root = rank; + + VLOG(3) << "begin hccl send, parameter is: "<< "root " << root + << ", comm: " << comm->comm() << ", stream: " << stream; - VLOG(3) << "Dest rank:" << destRank << " Invoke hcom send. 
Sent " - << x->numel(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel, + dtype, (uint32_t)root, comm->comm(), stream)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index 7916d155ee761..3f47870b841c9 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/collective/send_v2_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" #if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" @@ -41,34 +42,78 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(send_v2); -USE_NO_KERNEL_OP(c_comm_init_hcom); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(send_v2, NPU); -void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ - std::string rank_table_file = getenv("RANK_TABLE_FILE"); - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - int src_rank = atoi(getenv("SRC_RANK")); - int dest_rank = atoi(getenv("DEST_RANK")); - VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + int rank_id = atoi(getenv("RANK_ID")); + 
int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + + std::vector endpointList={"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = + f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){ + + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id + << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = + f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); } void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ std::cout<< "BEGIN TEST:"<< __FUNCTION__ <Var("X"); + auto x = scope->Var("Data"); auto tensor_x = x->GetMutable(); int num = atoi(getenv("DATA_SIZE"));; EXPECT_GT(num, 0); @@ 
-88,24 +133,24 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ attrs["ring_id"]=0; attrs["srTag"]=0; - auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs); - + auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + for (int i = 0; i < 10; i ++) { op->Run(*scope, place); } VLOG(3)<<"send run over"; - ctx.Wait(); + ctx.Wait(); } TEST(send_v2, NPU){ f::Scope scope; + HcclRootInfo hccl_id; + char * npu_id=getenv("FLAGS_selected_npus"); VLOG(3) << "Select npu:" << npu_id; p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - VLOG(3) << "Place over"; - Prepare(&scope, ctx); - VLOG(3) << "Prepare over"; - TestHcomSendOp(&scope, ctx); - VLOG(3) << "Test over"; + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); } diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 4516aa38fb39b..320b498156b3f 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -28,6 +28,12 @@ class LookupTableV2NPUKernel : public framework::OpKernel { auto *ids_t = ctx.Input("Ids"); // int tensor auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); + + // It seems cann 20.1 accepts int64, but cann 20.2+ not. 
+ PADDLE_ENFORCE_EQ(ids_t->type(), framework::proto::VarType::INT32, + platform::errors::Unimplemented( + "The index of LookupTableV2 should be int32.")); + auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( table_var->IsType(), true, @@ -49,28 +55,26 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto *ids_t = ctx.Input("Ids"); + auto *output_grad_t = ctx.Input(framework::GradVarName("Out")); auto *table_grad_t = ctx.Output(framework::GradVarName("W")); - table_grad_t->mutable_data(ctx.GetPlace()); + auto *p = table_grad_t->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() .stream(); - // step2: ZerosLike x in device - Tensor zeroslike_w(table_grad_t->type()); - zeroslike_w.Resize(table_grad_t->dims()); - auto p = zeroslike_w.mutable_data(ctx.GetPlace()); - platform::NPUMemsetAsync(static_cast(p), 0, - zeroslike_w.numel() * sizeof(T), stream); + table_grad_t->numel() * sizeof(T), stream); - table_grad_t->mutable_data(ctx.GetPlace()); + // NOTE(zhiqiu): It seems in cann 20.1, the first input and output + // can be different tensor, but in cann 20.2+, it does inplace operation. + // Thus, the first input and output should be same tensor. auto runner_scatter = - NpuOpRunner("ScatterAdd", {zeroslike_w, *ids_t, *output_grad_t}, - {*table_grad_t}, {}); + NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, + {*table_grad_t}, {{"use_locking", true}}); runner_scatter.Run(stream); } }; diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc b/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc deleted file mode 100644 index f37915834bd75..0000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/string/printf.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; -namespace m = paddle::operators::math; - -USE_OP(lookup_table_v2); -USE_OP_DEVICE_KERNEL(lookup_table_v2, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto ids = scope->Var("Ids"); - auto out = scope->Var("Out"); - auto w = scope->Var("W"); - - auto ids_t = ids->GetMutable(); - auto out_t = out->GetMutable(); - auto w_t = w->GetMutable(); - int bsz = 10; - int dim = 32; - int seqlen = 8; - int vocab_size = 100; - TensorFromVector(std::vector(bsz * seqlen, 3), ctx, ids_t); - std::vector val(vocab_size * dim, 10.); - TensorFromVector(val, ctx, w_t); - ids_t->Resize({bsz, seqlen}); - w_t->Resize({vocab_size, dim}); - out_t->Resize({bsz, seqlen, dim}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - out_t->mutable_data(place); - f::AttributeMap attrs = {{}}; - auto op = f::OpRegistry::CreateOp("lookup_table_v2", - {{"W", {"W"}}, {"Ids", {"Ids"}}}, - {{"Out", {"Out"}}}, attrs); - op->Run(*scope, place); - 
std::vector out_v; - TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); - EXPECT_EQ(out_t->numel(), bsz * seqlen * dim); - T res = std::accumulate(out_v.begin(), out_v.end(), 0.); - float eps = 1.e-6; - EXPECT_LT(fabs(res - bsz * seqlen * dim * 10.), eps); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto w = scope->Var("W"); - auto ids = scope->Var("Ids"); - auto out = scope->Var("DOut"); - auto dw = scope->Var("DW"); - - auto w_t = w->GetMutable(); - auto ids_t = ids->GetMutable(); - auto out_t = out->GetMutable(); - auto dw_t = dw->GetMutable(); - - int bsz = 2; - int dim = 2; - int seqlen = 2; - int vocab_size = 4; - - std::vector val_int(bsz * seqlen, 3); - std::vector val(vocab_size * dim, 0.); - std::vector val_out(bsz * seqlen * dim, 1.); - - TensorFromVector(val_int, ctx, ids_t); - TensorFromVector(val, ctx, w_t); - TensorFromVector(val, ctx, dw_t); - TensorFromVector(val_out, ctx, out_t); - - w_t->Resize({vocab_size, dim}); - ids_t->Resize({bsz, seqlen}); - out_t->Resize({bsz, seqlen, dim}); - dw_t->Resize({vocab_size, dim}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - out_t->mutable_data(place); - w_t->mutable_data(place); - dw_t->mutable_data(place); - f::AttributeMap attrs = {{}}; - auto op = f::OpRegistry::CreateOp( - "lookup_table_v2_grad", - {{"Ids", {"Ids"}}, {"W", {"W"}}, {"Out@GRAD", {"DOut"}}}, - {{"W@GRAD", {"DW"}}}, attrs); - op->Run(*scope, place); - ctx.Wait(); - std::vector w_v; - TensorToVector(*dw_t, ctx, &w_v); - ctx.Wait(); - EXPECT_EQ(dw_t->numel(), vocab_size * dim); - T res = std::accumulate(w_v.begin(), w_v.end(), 0.); - float eps = 1.e-6; - EXPECT_LT(fabs(res - bsz * seqlen * dim), eps); -} - -TEST(lookup_table_v2, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); -} - -TEST(lookup_table_v2_grad, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); -} diff --git 
a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index e4fa4a96a5cf7..5bce5719d7cea 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -125,6 +125,11 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, framework::LoDTensor cpu_tensor; platform::CPUPlace cpu_place; TensorCopy(print_tensor, cpu_place, &cpu_tensor); +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(print_tensor.place())) { + platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait(); + } +#endif data = cpu_tensor.data(); } diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 8a205fdf0b787..194f3e3332c11 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -157,15 +157,10 @@ class HCCLComm { virtual int nranks() const = 0; virtual int rank() const = 0; virtual int device_id() const = 0; + virtual HcclComm comm() const = 0; virtual aclrtStream stream() const = 0; virtual NPUDeviceContext* dev_context() const = 0; virtual ~HCCLComm() = default; - - unsigned long NextTagId() { - return tag_counter_++; - } - private: - std::atomic tag_counter_; }; // A singleton HCCL communicator context reserves communication ring ids @@ -176,11 +171,12 @@ class HCCLCommContext { return comm_ctx; } - HCCLComm* CreateHCCLComm(const std::vector& world_rank_ids, int rank, int dev_id, int ring_id = 0); - + HCCLComm* CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, + int rank, int dev_id, int ring_id); // a latter comm with the same dev_id and the same ring_id // will override the former - HCCLComm* AssignHCCLComm(int nranks, int rank, int dev_id, int ring_id = 0); + HCCLComm* AssignHCCLComm(HcclComm comm, int nranks, int rank, + int dev_id, int ring_id); // retrieve a communicator by the ring id in multiprocessing mode HCCLComm* Get(int ring_id) const { @@ -217,20 +213,21 @@ 
class HCCLCommContext { private: // Init global hcom - HCCLCommContext() { InitHcomWorldGroup(); } + HCCLCommContext() {} + // we may use group feature in the feature + // HCCLCommContext() { InitHcomWorldGroup(); } + HcclComm comm_; public: - ~HCCLCommContext(){ - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_destroy()); - } + ~HCCLCommContext(){ } std::once_flag once_flag_; std::mutex comm_map_mutex_; // ring id to dev-HCCLComm std::map>> comm_map_; - void InitHcomWorldGroup(); + // void InitHcomWorldGroup(); void ReleaseHCCLComms(); DISABLE_COPY_AND_ASSIGN(HCCLCommContext); diff --git a/paddle/fluid/platform/collective_helper_npu.cc b/paddle/fluid/platform/collective_helper_npu.cc index 56cfdec4b5e70..4efa54e774aa3 100644 --- a/paddle/fluid/platform/collective_helper_npu.cc +++ b/paddle/fluid/platform/collective_helper_npu.cc @@ -34,6 +34,13 @@ class HCCLCommImpl : public HCCLComm { return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device; } + ~HCCLCommImpl(){ + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_)); + } + + void set_comm(HcclComm comm) { comm_ = comm; } + HcclComm comm() const override { return comm_; } + aclrtStream stream() const override { return dev_ctx_->stream(); } void set_dev_ctx(std::unique_ptr&& dev_ctx) { @@ -45,46 +52,43 @@ class HCCLCommImpl : public HCCLComm { int ring_id_; int nranks_; int rank_; + HcclComm comm_; std::unique_ptr dev_ctx_; }; -HCCLComm* HCCLCommContext::CreateHCCLComm(const std::vector& world_rank_ids, int rank, int dev_id, int ring_id) { +HCCLComm* HCCLCommContext::CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, + int rank, int dev_id, int ring_id) { + PADDLE_ENFORCE_NOT_NULL(hccl_id, + platform::errors::InvalidArgument( + "The hccl unique id should not be null.")); PADDLE_ENFORCE_GT( - world_rank_ids.size(), 1, + nranks, 1, platform::errors::InvalidArgument( - "Expected world_rank_ids.size() > 1. But received size is %d.", world_rank_ids.size())); + "Expected nranks > 1. 
But received nranks is %d.", nranks)); PADDLE_ENFORCE_GE(rank, 0, platform::errors::InvalidArgument( "Expected rank >= 0. But received rank is %d.", rank)); PADDLE_ENFORCE_LT( - rank, world_rank_ids.size(), + rank, nranks, platform::errors::InvalidArgument( "Expected rank < nranks. But received rank is %d, nranks is %d.", - rank, world_rank_ids.size())); + rank, nranks)); PADDLE_ENFORCE_GE( dev_id, 0, platform::errors::InvalidArgument( "Expected dev_id >= 0. But received dev_id is %d.", dev_id)); - PADDLE_ENFORCE_GE( - ring_id, 0, - platform::errors::InvalidArgument( - "Expected ring_id >= 0. But received ring_id is %d.", ring_id)); - - auto* comm_wrapper = AssignHCCLComm(world_rank_ids.size(), rank, dev_id, ring_id); - // HACK(sunpeng17): hcom API requires bind stream to a model - // but we don't need model in Paddle, so we feed stream pointer as model pointer + HcclComm comm; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id)); PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::hcom_bind_model(comm_wrapper->stream(), - comm_wrapper->stream())); + platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm)); + + VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks << ", hccl_id: " << hccl_id << ", rank: " << rank; - // Get world_rank_ids registered in gen_nccl_id op - std::string group_name = HCOM_GROUP_PREFIX + std::to_string(ring_id); - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_create_group( - group_name.c_str(), world_rank_ids.size(), (unsigned int*)world_rank_ids.data())); + auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id); VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id - << " has been created on device " << dev_id << ", group name: " << group_name; + << " has been created on device " << dev_id << ", with comm: " << comm_wrapper->comm(); std::call_once(once_flag_, []() { std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); }); @@ -93,7 +97,8 @@ HCCLComm* 
HCCLCommContext::CreateHCCLComm(const std::vector& world_rank_ids return comm_wrapper; } -HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int ring_id) { +HCCLComm* HCCLCommContext::AssignHCCLComm(HcclComm comm, int nranks, int rank, + int dev_id, int ring_id) { std::unique_ptr dev_ctx( new NPUDeviceContext(NPUPlace(dev_id))); @@ -101,6 +106,7 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int c->set_ring_id(ring_id); c->set_nranks(nranks); c->set_rank(rank); + c->set_comm(comm); c->set_dev_ctx(std::move(dev_ctx)); comm_map_mutex_.lock(); @@ -112,23 +118,14 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int dev2comm.emplace(dev_id, std::unique_ptr(c)); comm_map_mutex_.unlock(); - return comm_map_[ring_id][dev_id].get(); -} - -void HCCLCommContext::InitHcomWorldGroup() { - const char *rank_table_file = getenv(ENV_RANK_TABLE_FILE); - PADDLE_ENFORCE_NOT_NULL( - rank_table_file, - platform::errors::InvalidArgument("The RANK_TABLE_FILE environment variable should not be null.")); - - const char *rank_id = getenv(ENV_RANK_ID); - PADDLE_ENFORCE_NOT_NULL( - rank_id, - platform::errors::InvalidArgument("The RANK_ID environment variable should not be null.")); + if (ring_id == 0) { + auto* dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::NPUPlace(dev_id))); + dev_ctx->set_hccl_comm(comm); + } - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_init(rank_table_file, rank_id)); - VLOG(3) << "Successfully initialized hcom. 
rank_table_file: " - << rank_table_file << ", rank_id " << rank_id; + return comm_map_[ring_id][dev_id].get(); } void HCCLCommContext::ReleaseHCCLComms() { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index bf34e57b773fa..da602bfd5290e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -185,11 +185,21 @@ class NPUDeviceContext : public DeviceContext { void WaitStreamCallback() const { return stream_->WaitCallback(); } +#if defined(PADDLE_WITH_ASCEND_CL) + /*! \brief Return hccl communicators. */ + HcclComm hccl_comm() const { return hccl_comm_; } + + /*! \brief Set hccl communicators. */ + void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; } +#endif + private: NPUPlace place_; aclrtContext context_; -#ifdef PADDLE_WITH_ASCEND_HCCL - HCCLContext_t hccl_context_; + +#ifdef PADDLE_WITH_ASCEND_CL + // HCCLContext_t hccl_context_; + HcclComm hccl_comm_{nullptr}; #endif // Need to be the same with other DeviceContext, diff --git a/paddle/fluid/platform/dynload/hccl.h b/paddle/fluid/platform/dynload/hccl.h index 2ad62213415c9..328a4e0e78d31 100644 --- a/paddle/fluid/platform/dynload/hccl.h +++ b/paddle/fluid/platform/dynload/hccl.h @@ -13,14 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -// #include -// #include +#include +#include #include // NOLINT #include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/dynload/hcom.h" +// #include "paddle/fluid/platform/dynload/hcom.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#define HCOM_GROUP_PREFIX "HCOM_GROUP_" + namespace paddle { namespace platform { namespace dynload { @@ -43,27 +45,14 @@ extern void* hccl_dso_handle; extern DynLoad__##__name __name #define HCCL_RAND_ROUTINE_EACH(__macro) \ - __macro(hcom_init); \ - __macro(hcom_destroy); \ - __macro(hcom_bind_model); \ - __macro(hcom_unbind_model); \ - __macro(hcom_send); \ - __macro(hcom_receive); \ - __macro(hcom_broadcast); \ - __macro(hcom_all_gather); \ - __macro(hcom_all_reduce); \ - __macro(hcom_reduce_scatter); \ - __macro(hcom_create_group); \ - __macro(hcom_destroy_group); \ - __macro(hcom_get_rank_id); \ - __macro(hcom_get_local_rank_id); \ - __macro(hcom_get_local_rank_size); \ - __macro(hcom_get_split_strategy); \ - __macro(hcom_set_split_strategy_by_size); \ - __macro(hcom_set_split_strategy_by_index); \ - __macro(hcom_get_group_rank_from_world_rank); \ - __macro(hcom_get_world_rank_from_group_rank); - + __macro(HcclReduceScatter); \ + __macro(HcclCommDestroy); \ + __macro(HcclAllReduce); \ + __macro(HcclCommInitRootInfo); \ + __macro(HcclGetRootInfo); \ + __macro(HcclBroadcast); \ + __macro(HcclCommInitClusterInfo); \ + __macro(HcclAllGather); HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 85d255a1f91c1..a3fc148265e8c 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -40,7 +40,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_ASCEND_CL #include "acl/acl.h" -#include "paddle/fluid/platform/dynload/hcom.h" +#include "hccl/hccl_types.h" #endif // PADDLE_WITH_ASCEND_CL #include @@ -1013,7 +1013,7 @@ struct NPUStatusType {}; } DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); -DEFINE_NPU_STATUS_TYPE(hcclResult_t, HCCL_SUCCESS); +DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS); } // namespace details inline std::string build_npu_error_msg(aclError stat) { @@ -1022,7 +1022,7 @@ inline std::string build_npu_error_msg(aclError stat) { return sout.str(); } -inline std::string build_npu_error_msg(hcclResult_t stat) { +inline std::string build_npu_error_msg(HcclResult stat) { std::ostringstream sout; sout << " HCCL error, the error code is : " << stat << ". "; return sout.str(); diff --git a/paddle/fluid/platform/hccl_helper.h b/paddle/fluid/platform/hccl_helper.h index e2d6be57ceecb..8eed32b416da7 100644 --- a/paddle/fluid/platform/hccl_helper.h +++ b/paddle/fluid/platform/hccl_helper.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_ASCEND_CL) +#if defined(PADDLE_WITH_HCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_ASCEND_CL) #include #include @@ -24,30 +24,22 @@ #include #include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/platform/collective_helper.h" - -#ifdef PADDLE_WITH_NCCL -#include "paddle/fluid/platform/dynload/nccl.h" -#endif - -#ifdef PADDLE_WITH_RCCL -#include "paddle/fluid/platform/dynload/rccl.h" -#endif #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/dynload/hccl.h" #endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" -#define NCCL_ID_VARNAME "NCCLID" +#define HCCL_ID_VARNAME "HCCLID" namespace paddle { namespace platform { -inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) { 
+inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { if (type == framework::proto::VarType::FP32) { return HCCL_DATA_TYPE_FP32; } else if (type == framework::proto::VarType::FP16) { @@ -56,7 +48,7 @@ inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) { return HCCL_DATA_TYPE_INT32; } else if (type == framework::proto::VarType::INT8) { return HCCL_DATA_TYPE_INT8; - } + } // else if (type == framework::proto::VarType::FP64) { // return HCCL_DATA_TYPE_FP32; // } @@ -66,298 +58,301 @@ inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) { } } -// // NOTE(minqiyang): according to the ncclGroupEnd documentations: -// // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, -// // ncclGroupEnd will wait for all communicators to be initialized, which will -// // cause blocking problem when a runtime_error was thrown, so try only guard -// // NCCL actions when use it. -// class NCCLGroupGuard { +// NOTE(minqiyang): according to the ncclGroupEnd documentations: +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, +// ncclGroupEnd will wait for all communicators to be initialized, which will +// cause blocking problem when a runtime_error was thrown, so try only guard +// HCCL actions when use it. 
+ +// class HCCLGroupGuard { // public: -// static std::mutex &NCCLMutex() { +// static std::mutex &HCCLMutex() { // static std::mutex mtx; // return mtx; // } -// inline NCCLGroupGuard() { -// NCCLMutex().lock(); +// inline HCCLGroupGuard() { +// HCCLMutex().lock(); // PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); // } -// inline ~NCCLGroupGuard() PADDLE_MAY_THROW { +// inline ~HCCLGroupGuard() PADDLE_MAY_THROW { // PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); -// NCCLMutex().unlock(); +// HCCLMutex().unlock(); // } // }; -// struct NCCLContext { -// std::unique_ptr ctx_; -// ncclComm_t comm_; +struct HCCLContext { + std::unique_ptr ctx_; + HcclComm comm_; -// explicit NCCLContext(int dev_id) -// : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {} + explicit HCCLContext(int dev_id) + : ctx_(new NPUDeviceContext(NPUPlace(dev_id))), comm_{nullptr} {} -// gpuStream_t stream() const { return ctx_->stream(); } -// ncclComm_t comm() const { return comm_; } + aclrtStream stream() const { return ctx_->stream(); } + HcclComm comm() const { return comm_; } -// int device_id() const { -// return BOOST_GET_CONST(platform::CUDAPlace, ctx_->GetPlace()).device; -// } -// }; + int device_id() const { + return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device; + } +}; + +struct HCCLContextMap { + std::unordered_map contexts_; + std::vector order_; + + explicit HCCLContextMap(const std::vector &places, + HcclRootInfo *hccl_id = nullptr, + size_t num_trainers = 1, size_t trainer_id = 0) { + PADDLE_ENFORCE_EQ(!places.empty(), true, + platform::errors::InvalidArgument( + "The HCCL place should not be empty.")); + order_.reserve(places.size()); + for (auto &p : places) { + int dev_id = BOOST_GET_CONST(NPUPlace, p).device; + order_.emplace_back(dev_id); + contexts_.emplace(dev_id, HCCLContext(dev_id)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + platform::errors::Unavailable("HCCL Context Map does not support " + "contain 
two or more same device.")); + + std::unique_ptr comms(new HcclComm[order_.size()]); + // if num_trainers == 1, should create a new nccl id for local comms. + if (num_trainers == 1 && hccl_id == nullptr) { + // we do not know how to tackle this situation under hccl + // std::lock_guard guard(HCCLGroupGuard::HCCLMutex()); + // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::ncclCommInitAll( + // comms.get(), static_cast(order_.size()), order_.data())); + } else { + PADDLE_ENFORCE_NOT_NULL(hccl_id, platform::errors::InvalidArgument( + "The HCCL id should not be null.")); + { + int nranks = num_trainers * order_.size(); + // HCCLGroupGuard gurad; + for (size_t i = 0; i < order_.size(); ++i) { + int gpu_id = order_[i]; + int rank; + if (order_.size() > 1) { + rank = trainer_id * order_.size() + i; + } else { + rank = trainer_id; + } + VLOG(1) << "init hccl rank:" << rank << ", nranks:" << nranks + << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; + aclrtSetDevice(gpu_id); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommInitRootInfo( + nranks, hccl_id, rank, comms.get() + i)); + } + } + } + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } + } -// struct NCCLContextMap { -// std::unordered_map contexts_; -// std::vector order_; - -// explicit NCCLContextMap(const std::vector &places, -// ncclUniqueId *nccl_id = nullptr, -// size_t num_trainers = 1, size_t trainer_id = 0) { -// PADDLE_ENFORCE_EQ(!places.empty(), true, -// platform::errors::InvalidArgument( -// "The NCCL place should not be empty.")); -// order_.reserve(places.size()); -// for (auto &p : places) { -// int dev_id = BOOST_GET_CONST(CUDAPlace, p).device; -// order_.emplace_back(dev_id); -// contexts_.emplace(dev_id, NCCLContext(dev_id)); -// } -// PADDLE_ENFORCE_EQ( -// order_.size(), contexts_.size(), -// platform::errors::Unavailable("NCCL Context Map does not support " -// "contain two or more same device.")); - -// std::unique_ptr comms(new 
ncclComm_t[order_.size()]); -// // if num_trainers == 1, should create a new nccl id for local comms. -// if (num_trainers == 1 && nccl_id == nullptr) { -// std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); -// PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( -// comms.get(), static_cast(order_.size()), order_.data())); -// } else { -// PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument( -// "The NCCL id should not be null.")); -// { -// int nranks = num_trainers * order_.size(); -// NCCLGroupGuard gurad; -// for (size_t i = 0; i < order_.size(); ++i) { -// int gpu_id = order_[i]; -// int rank; -// if (order_.size() > 1) { -// rank = trainer_id * order_.size() + i; -// } else { -// rank = trainer_id; -// } -// VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks -// << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; -// SetDeviceId(gpu_id); -// PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( -// comms.get() + i, nranks, *nccl_id, rank)); -// } -// } -// } -// int i = 0; -// for (auto &dev_id : order_) { -// contexts_.at(dev_id).comm_ = comms[i++]; -// } -// } + HCCLContextMap(const HCCLContextMap &other) = delete; + HCCLContextMap &operator=(const HCCLContextMap &other) = delete; -// NCCLContextMap(const NCCLContextMap &other) = delete; -// NCCLContextMap &operator=(const NCCLContextMap &other) = delete; + NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } -// CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + NPUDeviceContext *DevCtx(platform::Place p) const { + return DevCtx(BOOST_GET_CONST(NPUPlace, p).device); + } -// CUDADeviceContext *DevCtx(platform::Place p) const { -// return DevCtx(BOOST_GET_CONST(CUDAPlace, p).device); -// } + const HCCLContext &at(platform::Place p) const { + return this->at(BOOST_GET_CONST(NPUPlace, p).device); + } -// const NCCLContext &at(platform::Place p) const { -// return this->at(BOOST_GET_CONST(CUDAPlace, p).device); 
-// } + const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } -// const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } + void WaitAll() { + for (auto &p : contexts_) { + p.second.ctx_->Wait(); + } + } +}; -// void WaitAll() { -// for (auto &p : contexts_) { -// p.second.ctx_->Wait(); -// } -// } -// }; +inline std::string GetFlatHCCLVarName(size_t pos) { + if (pos == 0) { + return HCCL_ID_VARNAME; + } + return string::Sprintf("%s_%d", HCCL_ID_VARNAME, static_cast(pos)); +} -// inline std::string GetFlatNCCLVarName(size_t pos) { -// if (pos == 0) { -// return NCCL_ID_VARNAME; -// } -// return string::Sprintf("%s_%d", NCCL_ID_VARNAME, static_cast(pos)); -// } - -// inline std::string GetHierarchicalExterNCCLVarName(size_t pos) { -// return string::Sprintf("Hierarchical_exter_%s_%d", NCCL_ID_VARNAME, -// static_cast(pos)); -// } -// inline std::string GetHierarchicalInterNCCLVarName(size_t pos) { -// return string::Sprintf("Hierarchical_inter_%s_%d", NCCL_ID_VARNAME, -// static_cast(pos)); -// } - -// class NCCLCommunicator { -// public: -// NCCLCommunicator() {} -// virtual ~NCCLCommunicator() PADDLE_MAY_THROW {} +inline std::string GetHierarchicalExterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_exter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} +inline std::string GetHierarchicalInterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_inter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} -// NCCLContextMap *DefaultFlatCtx() const { -// if (flat_ctxs_.size() == 0) { -// return nullptr; -// } +class HCCLCommunicator { + public: + HCCLCommunicator() {} + virtual ~HCCLCommunicator() PADDLE_MAY_THROW {} -// return flat_ctxs_[0].get(); -// } + HCCLContextMap *DefaultFlatCtx() const { + if (flat_ctxs_.size() == 0) { + return nullptr; + } -// std::vector> *GetFlatCtxs() { -// return &flat_ctxs_; -// } + return flat_ctxs_[0].get(); + } -// NCCLContextMap *GetFlatCtx(size_t run_order) const { -// 
return flat_ctxs_[run_order % flat_ctxs_.size()].get(); -// } + std::vector> *GetFlatCtxs() { + return &flat_ctxs_; + } -// NCCLContextMap *GetRunEnvNCCLCtx(size_t run_order, -// bool use_hierarchical_allreduce) const { -// if (!use_hierarchical_allreduce) { -// return GetFlatCtx(run_order); -// } + HCCLContextMap *GetFlatCtx(size_t run_order) const { + return flat_ctxs_[run_order % flat_ctxs_.size()].get(); + } -// return GetHierarchicalInterCtx(run_order); -// } + HCCLContextMap *GetRunEnvHCCLCtx(size_t run_order, + bool use_hierarchical_allreduce) const { + if (!use_hierarchical_allreduce) { + return GetFlatCtx(run_order); + } - -// *When nccl inits nccl comm using ncclCommInitAll, it meets error when -// *allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So -// *create a new nccl comm for sync_batch_norm_op. And these codes should be -// *polished with a unified nccl management. - -// NCCLContextMap *GetSyncBatchNormCtx( -// framework::Scope *scope, const std::vector &places) { -// auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); -// if (nccl_id_var != nullptr) { -// return DefaultFlatCtx(); -// } - -// if (sync_batch_norm_ctx_.get() == nullptr) { -// sync_batch_norm_ctx_.reset(new NCCLContextMap(places)); -// } -// return sync_batch_norm_ctx_.get(); -// } + return GetHierarchicalInterCtx(run_order); + } -// void InitFlatCtxs(const std::vector &places, -// const std::vector &nccl_ids, -// size_t trainers_num, size_t trainer_id) { -// if (nccl_ids.size() == 0) { -// auto ptr = new platform::NCCLContextMap(places); -// VLOG(1) << "init local trainer"; -// flat_ctxs_.emplace_back(ptr); -// } else { -// for (size_t i = 0; i < nccl_ids.size(); i++) { -// auto ptr = new platform::NCCLContextMap(places, nccl_ids[i], -// trainers_num, trainer_id); -// VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; -// flat_ctxs_.emplace_back(ptr); -// } -// } - -// // as Executor have no way to use ncclComm created by ParallelExecutor, -// 
// we assign all flatten contexts to NCCLCommContext to fix. -// int nranks = static_cast(trainers_num * places.size()); -// int nrings = static_cast(flat_ctxs_.size()); -// for (int ring_id = 0; ring_id < nrings; ++ring_id) { -// for (size_t p = 0; p < places.size(); ++p) { -// int rank = trainer_id * places.size() + p; -// int dev_id = BOOST_GET_CONST(CUDAPlace, places[p]).device; -// auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); -// NCCLCommContext::Instance().AssignNCCLComm(ctx.comm_, nranks, rank, -// dev_id, ring_id); -// } -// } -// } -// void InitHierarchicalCtxs(const std::vector &places, -// const std::vector &inter_nccl_ids, -// const std::vector &exter_nccl_ids, -// size_t trainers_num, size_t trainer_id, -// size_t inter_trainers_num, -// size_t exter_trainers_num) { -// PADDLE_ENFORCE_EQ( -// trainers_num, inter_trainers_num * exter_trainers_num, -// platform::errors::InvalidArgument( -// "trainers_num:%llu != inter_trainers_num:%llu * " -// "exter_trainers_num:%llu", -// trainers_num, inter_trainers_num, exter_trainers_num)); - -// PADDLE_ENFORCE_GT( -// inter_trainers_num, 1, -// platform::errors::InvalidArgument( -// "The inter_trainers_num:%llu should be larger than 1.", -// inter_trainers_num)); - -// int inter_trainer_id = trainer_id % inter_trainers_num; -// for (size_t i = 0; i < inter_nccl_ids.size(); i++) { -// VLOG(1) << "init inter_trainer_id:" << inter_trainer_id -// << ", comm no:" << i; -// auto local = new NCCLContextMap(places, inter_nccl_ids[i], -// inter_trainers_num, inter_trainer_id); - -// h_inter_ctxs_.emplace_back(local); -// } - -// int exter_trainer_id = -1; -// if (trainer_id % inter_trainers_num == 0) { -// exter_trainer_id = trainer_id / inter_trainers_num; -// } - -// if (exter_trainer_id >= 0) { -// for (size_t i = 0; i < exter_nccl_ids.size(); i++) { -// auto ex = new NCCLContextMap(places, exter_nccl_ids[i], -// exter_trainers_num, exter_trainer_id); -// VLOG(1) << "init exter_trainer_id:" << exter_trainer_id 
-// << ", comm no:" << i; -// h_exter_ctxs_.emplace_back(ex); -// } -// } -// } + /* + When nccl inits nccl comm using ncclCommInitAll, it meets error when + allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So + create a new nccl comm for sync_batch_norm_op. And these codes should be + polished with a unified nccl management. + */ -// bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; } + HCCLContextMap *GetSyncBatchNormCtx(framework::Scope* scope, const std::vector &places) { + auto *hccl_id_var = scope->FindVar(HCCL_ID_VARNAME); + if (hccl_id_var != nullptr) { + return DefaultFlatCtx(); + } -// NCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const { -// PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0, -// platform::errors::InvalidArgument( -// "Hierarchical ctxs should be initialized firstly!")); -// return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get(); -// } + if (sync_batch_norm_ctx_.get() == nullptr) { + sync_batch_norm_ctx_.reset(new HCCLContextMap(places)); + } + return sync_batch_norm_ctx_.get(); + } -// NCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const { -// PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0, -// platform::errors::InvalidArgument( -// "Hierarchical ctxs should be initialized firstly!")); -// return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get(); -// } + void InitFlatCtxs(const std::vector &places, + const std::vector &hccl_ids, + size_t trainers_num, size_t trainer_id) { + if (hccl_ids.size() == 0) { + auto ptr = new platform::HCCLContextMap(places); + VLOG(1) << "init local trainer"; + flat_ctxs_.emplace_back(ptr); + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto ptr = new platform::HCCLContextMap(places, hccl_ids[i], + trainers_num, trainer_id); + VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; + flat_ctxs_.emplace_back(ptr); + } + } + + // as Executor have no way to use ncclComm created by ParallelExecutor, + // we assign all flatten 
contexts to HCCLCommContext to fix. + int nranks = static_cast(trainers_num * places.size()); + int nrings = static_cast(flat_ctxs_.size()); + for (int ring_id = 0; ring_id < nrings; ++ring_id) { + for (size_t p = 0; p < places.size(); ++p) { + int rank = trainer_id * places.size() + p; + int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device; + auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); + HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank, + dev_id, ring_id); + } + } + } -// std::vector> *GetHierarchicalInterCtxs() { -// return &h_inter_ctxs_; -// } + void InitHierarchicalCtxs(const std::vector &places, + const std::vector &inter_hccl_ids, + const std::vector &exter_hccl_ids, + size_t trainers_num, size_t trainer_id, + size_t inter_trainers_num, + size_t exter_trainers_num) { + PADDLE_ENFORCE_EQ( + trainers_num, inter_trainers_num * exter_trainers_num, + platform::errors::InvalidArgument( + "trainers_num:%llu != inter_trainers_num:%llu * " + "exter_trainers_num:%llu", + trainers_num, inter_trainers_num, exter_trainers_num)); + + PADDLE_ENFORCE_GT( + inter_trainers_num, 1, + platform::errors::InvalidArgument( + "The inter_trainers_num:%llu should be larger than 1.", + inter_trainers_num)); + + int inter_trainer_id = trainer_id % inter_trainers_num; + for (size_t i = 0; i < inter_hccl_ids.size(); i++) { + VLOG(1) << "init inter_trainer_id:" << inter_trainer_id + << ", comm no:" << i; + auto local = new HCCLContextMap(places, inter_hccl_ids[i], + inter_trainers_num, inter_trainer_id); + + h_inter_ctxs_.emplace_back(local); + } + + int exter_trainer_id = -1; + if (trainer_id % inter_trainers_num == 0) { + exter_trainer_id = trainer_id / inter_trainers_num; + } + + if (exter_trainer_id >= 0) { + for (size_t i = 0; i < exter_hccl_ids.size(); i++) { + auto ex = new HCCLContextMap(places, exter_hccl_ids[i], + exter_trainers_num, exter_trainer_id); + VLOG(1) << "init exter_trainer_id:" << exter_trainer_id + << ", comm no:" << i; + 
h_exter_ctxs_.emplace_back(ex); + } + } + } -// std::vector> *GetHierarchicalExterCtxs() { -// return &h_exter_ctxs_; -// } + bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; } -// protected: -// // Support multi nccl comm on default nccl ring while NCCLContextMap can't. -// std::vector> flat_ctxs_; + HCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get(); + } -// // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce. -// // And h_exter_ctxs_ can support multi comm too. -// std::vector> h_inter_ctxs_; -// std::vector> h_exter_ctxs_; + HCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get(); + } -// // just used for sync_batch_norm op. -// std::unique_ptr sync_batch_norm_ctx_; -// }; + std::vector> *GetHierarchicalInterCtxs() { + return &h_inter_ctxs_; + } + + std::vector> *GetHierarchicalExterCtxs() { + return &h_exter_ctxs_; + } + + protected: + // Support multi nccl comm on default nccl ring while HCCLContextMap can't. + std::vector> flat_ctxs_; + + // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce. + // And h_exter_ctxs_ can support multi comm too. + std::vector> h_inter_ctxs_; + std::vector> h_exter_ctxs_; + + // just used for sync_batch_norm op. + std::unique_ptr sync_batch_norm_ctx_; +}; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/npu_profiler.h b/paddle/fluid/platform/npu_profiler.h index 05325aaf9baa1..a7b674d0d0c3f 100644 --- a/paddle/fluid/platform/npu_profiler.h +++ b/paddle/fluid/platform/npu_profiler.h @@ -23,7 +23,17 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -// For ACL 20.1 +#ifdef PADDLE_WITH_ASCEND_STRING +// For CANN 20.2+ +// ACL_AICORE_ARITHMETIC_UTILIZATION = 0, record arithmetic stats +// ACL_AICORE_PIPE_UTILIZATION = 1, record pipeline +// ACL_AICORE_MEMORY_BANDWIDTH = 2, record memory +// ACL_AICORE_L0B_AND_WIDTH = 3, recore internal memory +// ACL_AICORE_RESOURCE_CONFLICT_RATIO = 5, record pipeline ratio +constexpr aclprofAicoreMetrics default_metrics = + ACL_AICORE_ARITHMETIC_UTILIZATION; +#else +// For CANN 20.1 // ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats // ACL_AICORE_PIPELINE = 1, record pipeline // ACL_AICORE_SYNCHRONIZATION = 2, record sync @@ -32,6 +42,7 @@ namespace platform { // ACL_AICORE_STALL = 5, record pipeline ratio constexpr aclprofAicoreMetrics default_metrics = ACL_AICORE_ARITHMATIC_THROUGHPUT; +#endif // ACL_PROF_ACL_API, record ACL API stats // ACL_PROF_TASK_TIME, record AI core stats diff --git a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py index 4cda0ceeccf9c..ac80ea4c62cbf 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py @@ -14,6 +14,8 @@ import unittest import numpy as np +import sys +sys.path.append("..") from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py index 400ddd9d4aab0..2463ddb7137ac 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py @@ -41,7 +41,7 @@ def setUp(self): vocab = 10 dim = 20 w = np.ones([vocab, dim]).astype(self.dtype) - x = np.random.randint(0, vocab, 
size=(bsz, seqlen)).astype(np.int64) + x = np.random.randint(0, vocab, size=(bsz, seqlen)).astype(np.int32) out = np.ones([bsz, seqlen, dim]).astype(self.dtype) self.inputs = { diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index e65a3dac73928..4fcfd33b32f4e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -248,8 +248,9 @@ def test_npu(self): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + self.assertTrue(np.allclose( + npu_pred, cpu_pred, atol=1e-5)) # atol needed on cann 20.3 + self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5)) @unittest.skipIf(not paddle.is_compiled_with_npu(), diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py index 087256b298088..583a648224d73 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_any_op_npu.py @@ -16,6 +16,8 @@ import unittest import numpy as np +import sys +sys.path.append("..") from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid.core as core diff --git a/python/paddle/fluid/tests/unittests/test_assign_op_npu.py b/python/paddle/fluid/tests/unittests/test_assign_op_npu.py index 44515ce2e5b94..ed21549b7e01f 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op_npu.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op_npu.py @@ -36,7 +36,7 @@ def setUp(self): self.op_type = "assign" self.init_dtype() - x = np.rand.random([3,3]) + x = np.random.random([3, 3]).astype(self.dtype) self.inputs = {'X': x} self.attrs = {} @@ -46,7 +46,7 @@ def set_npu(self): self.__class__.use_npu = True def 
init_dtype(self): - self.dtype = np.int64 + self.dtype = np.float32 def test_check_output(self): self.check_output_with_place(self.place, check_dygraph=False) @@ -54,4 +54,3 @@ def test_check_output(self): if __name__ == '__main__': unittest.main() -