From 9ba08b10c85a86cf2c1bae1d0447c0f066f00084 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 16 Nov 2021 11:28:38 +0000 Subject: [PATCH 01/17] rename TensorBase interface data_type() to dtype() --- paddle/pten/api/lib/tensor.cc | 4 ++-- paddle/pten/api/lib/utils/tensor_utils.cc | 2 +- paddle/pten/core/compat_utils.h | 2 +- paddle/pten/core/dense_tensor.cc | 16 +++++++--------- paddle/pten/core/dense_tensor.h | 2 +- paddle/pten/core/tensor_base.h | 2 +- paddle/pten/kernels/cpu/utils.cc | 4 ++-- paddle/pten/kernels/cuda/math.cu | 2 +- paddle/pten/kernels/cuda/utils.cu | 4 ++-- paddle/pten/kernels/xpu/utils.cc | 4 ++-- paddle/pten/tests/api/test_tensor_utils.cc | 11 +++++------ paddle/pten/tests/core/test_dense_tensor.cc | 2 +- 12 files changed, 26 insertions(+), 29 deletions(-) diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 5d10c7209e2b8..bb3fba885862b 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -109,9 +109,9 @@ void Tensor::reshape(const std::vector &shape) { "and it will be implemented by calling the reshape kernel later.")); } -DataType Tensor::dtype() const { return impl_->data_type(); } +DataType Tensor::dtype() const { return impl_->dtype(); } -DataType Tensor::type() const { return impl_->data_type(); } +DataType Tensor::type() const { return impl_->dtype(); } DataLayout Tensor::layout() const { return impl_->layout(); } diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 2c362a11b11d9..b02392e5763be 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -125,7 +125,7 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { auto storage = src->release(); std::shared_ptr holder( new TensorStorage(std::move(storage))); - dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->data_type())); + dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); } void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { diff --git a/paddle/pten/core/compat_utils.h b/paddle/pten/core/compat_utils.h index dc65e04b3ae73..a0602f33e3de2 100644 --- a/paddle/pten/core/compat_utils.h +++ b/paddle/pten/core/compat_utils.h @@ -74,7 +74,7 @@ class CompatibleDenseTensorUtils { ret.meta_.dims[0] = end_idx - begin_idx; ret.meta_.offset = tensor->meta_.offset + begin_idx * (tensor->numel() / tensor->dims()[0]) * - paddle::experimental::SizeOf(tensor->data_type()); + paddle::experimental::SizeOf(tensor->dtype()); } return ret; } diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index fbc2a24312941..bb38c53ada04e 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -24,14 +24,12 @@ namespace pten { DenseTensor::DenseTensor(const std::shared_ptr& a, const DenseTensorMeta& meta) : meta_(meta), - storage_( - make_intrusive(a, SizeOf(data_type()) * numel())) {} + storage_(make_intrusive(a, SizeOf(dtype()) * numel())) {} DenseTensor::DenseTensor(const std::shared_ptr& a, DenseTensorMeta&& meta) : meta_(std::move(meta)), - storage_( - make_intrusive(a, SizeOf(data_type()) * numel())) {} + storage_(make_intrusive(a, SizeOf(dtype()) * numel())) {} DenseTensor::DenseTensor(intrusive_ptr storage, const DenseTensorMeta& meta) @@ -60,7 +58,7 @@ void* DenseTensor::mutable_data(size_t request_bytes) { storage_, paddle::platform::errors::PreconditionNotMet( "The storage must be valid when call the mutable data function.")); 
- size_t bytes = numel() * SizeOf(data_type()); + size_t bytes = numel() * SizeOf(dtype()); if (request_bytes) { PADDLE_ENFORCE_GE(request_bytes, bytes, @@ -87,19 +85,19 @@ T* DenseTensor::mutable_data() { paddle::experimental::CppTypeToDataType::Type(); } PADDLE_ENFORCE( - (data_type() == paddle::experimental::CppTypeToDataType::Type()), + (dtype() == paddle::experimental::CppTypeToDataType::Type()), paddle::platform::errors::InvalidArgument( "The type of data (%d) we are trying to retrieve does not match the " "type of data currently contained in the container (%d).", static_cast(paddle::experimental::CppTypeToDataType::Type()), - static_cast(data_type()))); + static_cast(dtype()))); return static_cast(mutable_data()); } template const T* DenseTensor::data() const { PADDLE_ENFORCE( - (data_type() == paddle::experimental::CppTypeToDataType::Type()), + (dtype() == paddle::experimental::CppTypeToDataType::Type()), paddle::platform::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); @@ -115,7 +113,7 @@ const void* DenseTensor::data() const { } void DenseTensor::check_memory_size() const { - size_t bytes = numel() * SizeOf(data_type()); + size_t bytes = numel() * SizeOf(dtype()); PADDLE_ENFORCE_GE(memory_size(), bytes, paddle::platform::errors::InvalidArgument( diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index b0a4195bc6cec..8ece80f529161 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -93,7 +93,7 @@ class DenseTensor : public TensorBase, /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. - DataType data_type() const noexcept override { return meta_.type; } + DataType dtype() const noexcept override { return meta_.type; } /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h index 79fd742aea10b..528a52cee8da4 100644 --- a/paddle/pten/core/tensor_base.h +++ b/paddle/pten/core/tensor_base.h @@ -43,7 +43,7 @@ class TensorBase { /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. - virtual DataType data_type() const = 0; + virtual DataType dtype() const = 0; /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. 
diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/utils.cc index 3e0bfccb1ec72..e089eabb0e533 100644 --- a/paddle/pten/kernels/cpu/utils.cc +++ b/paddle/pten/kernels/cpu/utils.cc @@ -38,8 +38,8 @@ void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) { VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr; CHECK(dst->layout() == src.layout()); - auto size = src.numel() * paddle::framework::SizeOfType( - TransToProtoVarType(src.data_type())); + auto size = src.numel() * + paddle::framework::SizeOfType(TransToProtoVarType(src.dtype())); if (paddle::platform::is_cpu_place(src_place) && paddle::platform::is_cpu_place(dst_place)) { diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu index 73a743d58e6a9..6a64290d39837 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ b/paddle/pten/kernels/cuda/math.cu @@ -81,7 +81,7 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) { dev_ctx.GetPlace()); pten::DenseTensor tmp( alloc, - DenseTensorMeta(x.data_type(), + DenseTensorMeta(x.dtype(), paddle::framework::make_ddim( {static_cast(temp_storage_bytes)}), x.layout())); diff --git a/paddle/pten/kernels/cuda/utils.cu b/paddle/pten/kernels/cuda/utils.cu index c3940b42ca46e..04cf1413cba68 100644 --- a/paddle/pten/kernels/cuda/utils.cu +++ b/paddle/pten/kernels/cuda/utils.cu @@ -48,8 +48,8 @@ void Copy(const CUDAContext& dev_ctx, VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr; CHECK(dst->layout() == src.layout()); - auto size = src.numel() * paddle::framework::SizeOfType( - TransToProtoVarType(src.data_type())); + auto size = src.numel() * + paddle::framework::SizeOfType(TransToProtoVarType(src.dtype())); if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT paddle::platform::is_cuda_pinned_place(dst_place)) { diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/utils.cc index 33bdc66ff01f3..9bfe493f5ff9d 100644 --- a/paddle/pten/kernels/xpu/utils.cc +++ b/paddle/pten/kernels/xpu/utils.cc @@ -38,8 +38,8 @@ void Copy(const XPUDeviceContext& dev_ctx, << dst_place; dst->Resize(src.dims()); CHECK(dst->layout() == src.layout()); - auto size = src.numel() * paddle::framework::SizeOfType( - TransToProtoVarType(src.data_type())); + auto size = src.numel() * + paddle::framework::SizeOfType(TransToProtoVarType(src.dtype())); if (paddle::platform::is_xpu_place(src_place) && // NOLINT paddle::platform::is_cpu_place(dst_place)) { diff --git a/paddle/pten/tests/api/test_tensor_utils.cc b/paddle/pten/tests/api/test_tensor_utils.cc index fd52b96542c71..cebb0fc07ee63 100644 --- a/paddle/pten/tests/api/test_tensor_utils.cc +++ b/paddle/pten/tests/api/test_tensor_utils.cc @@ -47,8 +47,7 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) { CHECK(dense_tensor.lod().size() == lod_tensor.lod().size()); CHECK(dense_tensor.lod()[0] == static_cast>((lod_tensor.lod()[0]))); - CHECK(dense_tensor.data_type() == - pten::TransToPtenDataType(lod_tensor.type())); + CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(lod_tensor.type())); CHECK(dense_tensor.layout() == pten::TransToPtenDataLayout(lod_tensor.layout())); CHECK(platform::is_cpu_place(lod_tensor.place())); @@ -58,7 +57,7 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) { auto dense_tensor_1 = MakePtenDenseTensor(lod_tensor); CHECK(dense_tensor_1->dims() == dims); - CHECK(dense_tensor_1->data_type() == dtype); + CHECK(dense_tensor_1->dtype() == dtype); CHECK(dense_tensor_1->layout() == layout); CHECK(dense_tensor_1->lod().size() == 
lod.size()); CHECK(dense_tensor_1->lod()[0] == lod[0]); @@ -83,7 +82,7 @@ TEST(tensor_utils, dense_tensor_to_tensor) { framework::Tensor tensor; MovesStorage(&dense_tensor, &tensor); - CHECK(dense_tensor.data_type() == pten::TransToPtenDataType(tensor.type())); + CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(tensor.type())); CHECK(dense_tensor.layout() == pten::TransToPtenDataLayout(tensor.layout())); CHECK(platform::is_cpu_place(tensor.place())); @@ -92,7 +91,7 @@ TEST(tensor_utils, dense_tensor_to_tensor) { auto dense_tensor_1 = MakePtenDenseTensor(tensor); CHECK(dense_tensor_1->dims() == dims); - CHECK(dense_tensor_1->data_type() == dtype); + CHECK(dense_tensor_1->dtype() == dtype); CHECK(dense_tensor_1->layout() == layout); const float* data_1 = dense_tensor_1->data(); CHECK(data_1[0] == 1.0f); @@ -117,7 +116,7 @@ TEST(PtenUtils, VarToPtTensor) { // 2. test API auto tensor_x = MakePtenTensorBaseFromVar(v, tensor_def); // 3. check result - ASSERT_EQ(tensor_x->data_type(), pten::DataType::INT32); + ASSERT_EQ(tensor_x->dtype(), pten::DataType::INT32); } } // namespace tests diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index dac2575713bfb..69c5e9b12606b 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -82,7 +82,7 @@ TEST(dense_tensor, ctor) { bool r{true}; r = r && (t.numel() == product(m.dims)); r = r && (t.dims() == m.dims); - r = r && (t.data_type() == m.type); + r = r && (t.dtype() == m.type); r = r && (t.layout() == m.layout); r = r && (t.place() == paddle::platform::CPUPlace()); r = r && t.initialized(); From 3c1afc0598df2ca15dc6b40f0484a56c92ac97ba Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 17 Nov 2021 05:12:57 +0000 Subject: [PATCH 02/17] rename type to dtype of TensorMeta --- paddle/pten/api/lib/utils/tensor_utils.cc | 4 ++-- paddle/pten/core/dense_tensor.cc | 4 ++-- paddle/pten/core/dense_tensor.h | 2 +- paddle/pten/core/tensor_meta.h | 24 +++++++++---------- paddle/pten/infermeta/binary.cc | 6 ++--- paddle/pten/infermeta/unary.cc | 8 +++---- paddle/pten/tests/core/test_dense_tensor.cc | 12 +++++----- paddle/pten/tests/kernels/test_dot_dev_api.cc | 2 +- .../tests/kernels/test_elementwise_dev_api.cc | 2 +- .../pten/tests/kernels/test_fill_dev_api.cc | 2 +- .../tests/kernels/test_flatten_dev_api.cc | 2 +- .../pten/tests/kernels/test_mean_dev_api.cc | 2 +- .../tests/kernels/test_reshape_dev_api.cc | 2 +- .../pten/tests/kernels/test_scale_dev_api.cc | 4 ++-- 14 files changed, 38 insertions(+), 38 deletions(-) diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index b02392e5763be..4936006d26f8a 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -146,7 +146,7 @@ void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); meta->dims = src.dims(); // Since the type of DenseTensorMeta is const, const_cast must be used - const_cast(meta->type) = pten::TransToPtenDataType(src.type()); + const_cast(meta->dtype) = pten::TransToPtenDataType(src.type()); // Since the type of DenseTensorMeta is const, const_cast must be used const_cast(meta->layout) = pten::TransToPtenDataLayout(src.layout()); @@ -164,7 +164,7 @@ void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); meta->dims = src.dims(); // Since the type 
of DenseTensorMeta is const, const_cast must be used - const_cast(meta->type) = pten::TransToPtenDataType(src.type()); + const_cast(meta->dtype) = pten::TransToPtenDataType(src.type()); // Since the type of DenseTensorMeta is const, const_cast must be used const_cast(meta->layout) = pten::TransToPtenDataLayout(src.layout()); diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index bb38c53ada04e..701ccb509f15c 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -80,8 +80,8 @@ T* DenseTensor::mutable_data() { // In order to be compatible with the original Tensor design and // execution system, we have to reset the datatype in mutable_data. // When the compatibility phase is over in the future, we can delete it - if (meta_.type == DataType::UNDEFINED) { - const_cast(meta_.type) = + if (meta_.dtype == DataType::UNDEFINED) { + const_cast(meta_.dtype) = paddle::experimental::CppTypeToDataType::Type(); } PADDLE_ENFORCE( diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 8ece80f529161..6d938d8ab5790 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -93,7 +93,7 @@ class DenseTensor : public TensorBase, /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. - DataType dtype() const noexcept override { return meta_.type; } + DataType dtype() const noexcept override { return meta_.dtype; } /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h index eae270171d88e..7cbc919dab985 100644 --- a/paddle/pten/core/tensor_meta.h +++ b/paddle/pten/core/tensor_meta.h @@ -39,9 +39,9 @@ struct DenseTensorMeta { using DataLayout = paddle::experimental::DataLayout; DenseTensorMeta() = default; - DenseTensorMeta(DataType type, const DDim& dims); - DenseTensorMeta(DataType type, const DDim& dims, DataLayout layout); - DenseTensorMeta(DataType type, + DenseTensorMeta(DataType dtype, const DDim& dims); + DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout); + DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout, const std::vector>& lod); @@ -54,30 +54,30 @@ struct DenseTensorMeta { /// marked with `const` are expected to remain unchanged. 
const bool is_scalar{false}; DDim dims; - const DataType type{DataType::UNDEFINED}; + const DataType dtype{DataType::UNDEFINED}; const DataLayout layout{DataLayout::NCHW}; LoD lod; size_t offset{0}; }; -inline DenseTensorMeta::DenseTensorMeta(DataType type, const DDim& dims) - : dims(dims), type(type) {} +inline DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims) + : dims(dims), dtype(dtype) {} -inline DenseTensorMeta::DenseTensorMeta(DataType type, +inline DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout) - : dims(dims), type(type), layout(layout) {} + : dims(dims), dtype(dtype), layout(layout) {} inline DenseTensorMeta::DenseTensorMeta( - DataType type, + DataType dtype, const DDim& dims, DataLayout layout, const std::vector>& lod) - : dims(dims), type(type), layout(layout), lod(lod) {} + : dims(dims), dtype(dtype), layout(layout), lod(lod) {} inline bool DenseTensorMeta::valid() const noexcept { bool valid{true}; - valid = valid && (type != DataType::UNDEFINED); + valid = valid && (dtype != DataType::UNDEFINED); valid = valid && (layout != DataLayout::UNDEFINED); valid = valid && (is_scalar || product(dims) >= 0); return valid; @@ -86,7 +86,7 @@ inline bool DenseTensorMeta::valid() const noexcept { inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { bool ret = true; return ret && (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) && - (lhs.type == rhs.type) && (lhs.layout == rhs.layout) && + (lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && (lhs.lod == rhs.lod) && (lhs.offset == rhs.offset); } diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index e124466a6d33a..838e450007fcd 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -56,7 +56,7 @@ DenseTensorMeta DotInferShape(const DenseTensorMeta& x_meta, y_dims.to_str())); x_dims[x_dims.size() - 1] = 1; - DenseTensorMeta return_meta(x_meta.type, x_dims, x_meta.layout); + DenseTensorMeta return_meta(x_meta.dtype, x_dims, x_meta.layout); return return_meta; } @@ -127,13 +127,13 @@ DenseTensorMeta MatmulInferShape(const DenseTensorMeta& x_meta, auto ddim_out = paddle::framework::make_ddim(new_dims); - return {x_meta.type, ddim_out, x_meta.layout}; + return {x_meta.dtype, ddim_out, x_meta.layout}; } DenseTensorMeta ElementwiseInferShape(const DenseTensorMeta& x_meta, const DenseTensorMeta& y_meta, int axis) { - DenseTensorMeta return_meta(x_meta.type, x_meta.dims, x_meta.layout); + DenseTensorMeta return_meta(x_meta.dtype, x_meta.dims, x_meta.layout); if (x_meta.dims != y_meta.dims) { auto x_dims = x_meta.dims; auto y_dims = y_meta.dims; diff --git a/paddle/pten/infermeta/unary.cc b/paddle/pten/infermeta/unary.cc index 5099984886cce..ea6e97db3460d 100644 --- a/paddle/pten/infermeta/unary.cc +++ b/paddle/pten/infermeta/unary.cc @@ -23,7 +23,7 @@ DenseTensorMeta UnchangedInferShape(const DenseTensorMeta& x_meta) { DenseTensorMeta ReductionInferShape(const DenseTensorMeta& x_meta) { const auto& out_dims = paddle::framework::make_ddim({1}); - DenseTensorMeta return_meta(x_meta.type, out_dims, x_meta.layout); + DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout); return return_meta; } @@ -63,7 +63,7 @@ DenseTensorMeta FlattenInferShape(const DenseTensorMeta& x_meta, out_shape.push_back(x_dims[i]); } const auto& out_dims = paddle::framework::make_ddim(out_shape); - DenseTensorMeta return_meta(x_meta.type, out_dims, x_meta.layout); + DenseTensorMeta return_meta(x_meta.dtype, 
out_dims, x_meta.layout); if (x_dims[0] == return_meta.dims[0]) { // Only pass LoD when the first dimension of output and Input(X) @@ -77,7 +77,7 @@ DenseTensorMeta FlattenInferShape(const DenseTensorMeta& x_meta, DenseTensorMeta FullLikeInferShape(const DenseTensorMeta& x_meta, DataType dtype, DataLayout layout) { - return {dtype == DataType::UNDEFINED ? x_meta.type : dtype, + return {dtype == DataType::UNDEFINED ? x_meta.dtype : dtype, x_meta.dims, layout == DataLayout::UNDEFINED ? x_meta.layout : layout}; } @@ -211,7 +211,7 @@ DenseTensorMeta InferShapeFromVecValue(const DenseTensorMeta& x_meta, "But received 'shape' is empty.")); auto x_dims = x_meta.dims; auto out_dims = ValidateShape(shape, x_dims); - DenseTensorMeta return_meta(x_meta.type, out_dims, x_meta.layout); + DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout); if (x_dims[0] == return_meta.dims[0]) { // Only pass LoD when the first dimension of output and Input(X) // are the same. diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index 69c5e9b12606b..fae4a5415a372 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -31,32 +31,32 @@ TEST(dense_tensor, meta) { CHECK(!meta_0.valid()); DenseTensorMeta meta_1(dtype, dims); - CHECK(meta_1.type == dtype); + CHECK(meta_1.dtype == dtype); CHECK(meta_1.dims == dims); CHECK(meta_1.valid()); DenseTensorMeta meta_2(dtype, dims, layout); - CHECK(meta_2.type == dtype); + CHECK(meta_2.dtype == dtype); CHECK(meta_2.dims == dims); CHECK(meta_2.layout == layout); CHECK(meta_2.valid()); DenseTensorMeta meta_3(dtype, dims, layout, lod); - CHECK(meta_3.type == dtype); + CHECK(meta_3.dtype == dtype); CHECK(meta_3.dims == dims); CHECK(meta_3.layout == layout); CHECK(meta_3.lod == lod); CHECK(meta_3.valid()); DenseTensorMeta meta_4(meta_3); - CHECK(meta_4.type == dtype); + CHECK(meta_4.dtype == dtype); CHECK(meta_4.dims == dims); CHECK(meta_4.layout == layout); CHECK(meta_4.lod == lod); CHECK(meta_4.valid()); DenseTensorMeta meta_5(std::move(meta_4)); - CHECK(meta_5.type == dtype); + CHECK(meta_5.dtype == dtype); CHECK(meta_5.dims == dims); CHECK(meta_5.layout == layout); CHECK(meta_5.lod == lod); @@ -82,7 +82,7 @@ TEST(dense_tensor, ctor) { bool r{true}; r = r && (t.numel() == product(m.dims)); r = r && (t.dims() == m.dims); - r = r && (t.dtype() == m.type); + r = r && (t.dtype() == m.dtype); r = r && (t.layout() == m.layout); r = r && (t.place() == paddle::platform::CPUPlace()); r = r && t.initialized(); diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc index 2276d49590a70..5485ef2843c2c 100644 --- a/paddle/pten/tests/kernels/test_dot_dev_api.cc +++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc @@ -62,7 +62,7 @@ TEST(DEV_API, dot) { // 3. check result ASSERT_EQ(out.dims().size(), 2); ASSERT_EQ(out.dims()[0], 3); - ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); auto expect_result = sum; diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index f6b93b731865c..f2525ae800acc 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -65,7 +65,7 @@ TEST(DEV_API, elementwise_add) { // 3. 
check result ASSERT_EQ(dense_out.dims().size(), 2); ASSERT_EQ(dense_out.dims()[0], 3); - ASSERT_EQ(dense_out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(dense_out.meta().dtype, pten::DataType::FLOAT32); ASSERT_EQ(dense_out.meta().layout, pten::DataLayout::NCHW); auto expect_result = sum; diff --git a/paddle/pten/tests/kernels/test_fill_dev_api.cc b/paddle/pten/tests/kernels/test_fill_dev_api.cc index 6e6af22f6de89..aa66877881b66 100644 --- a/paddle/pten/tests/kernels/test_fill_dev_api.cc +++ b/paddle/pten/tests/kernels/test_fill_dev_api.cc @@ -50,7 +50,7 @@ TEST(DEV_API, fill_any_like) { ASSERT_EQ(out.dims().size(), 2); ASSERT_EQ(out.dims()[0], 3); ASSERT_EQ(out.numel(), 6); - ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); auto* actual_result = out.data(); diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index b027c75a37b31..a9be6108d24b6 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -56,7 +56,7 @@ TEST(DEV_API, flatten) { ASSERT_EQ(out.dims()[1], expect_shape[1]); ASSERT_EQ(out.dims()[2], expect_shape[2]); ASSERT_EQ(out.numel(), 36); - ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); bool value_equal = true; diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index 1ae59ff8034f5..b16d339e18af3 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -49,7 +49,7 @@ TEST(DEV_API, mean) { // 3. check result ASSERT_EQ(out.dims().size(), 1); ASSERT_EQ(out.numel(), 1); - ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); auto expect_result = sum / 12; diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index c06cc8a8a406b..b227d3b009e89 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -54,7 +54,7 @@ TEST(DEV_API, reshape) { ASSERT_EQ(out.dims()[0], expect_shape[0]); ASSERT_EQ(out.dims()[1], expect_shape[1]); ASSERT_EQ(out.numel(), 36); - ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); bool value_equal = true; diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index b057821e6cf81..b87692137251a 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc @@ -56,7 +56,7 @@ TEST(DEV_API, scale) { // 3. check result ASSERT_EQ(out.dims().size(), 2); ASSERT_EQ(out.numel(), 12); - ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); auto expect_result = 23; @@ -101,7 +101,7 @@ TEST(DEV_API, scale_host) { // 3. 
check result ASSERT_EQ(out.dims().size(), 2); ASSERT_EQ(out.numel(), 12); - ASSERT_EQ(out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); auto expect_result = 23; From 7bc3cbb5431a2b37ee52e91314df407894718457 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 17 Nov 2021 13:27:33 +0000 Subject: [PATCH 03/17] merge the code --- paddle/pten/core/tensor_meta.h | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h index b311589d792eb..cc02c57a48ba1 100644 --- a/paddle/pten/core/tensor_meta.h +++ b/paddle/pten/core/tensor_meta.h @@ -60,37 +60,4 @@ struct DenseTensorMeta { size_t offset{0}; }; -<<<<<<< HEAD -inline DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims) - : dims(dims), dtype(dtype) {} - -inline DenseTensorMeta::DenseTensorMeta(DataType dtype, - const DDim& dims, - DataLayout layout) - : dims(dims), dtype(dtype), layout(layout) {} - -inline DenseTensorMeta::DenseTensorMeta( - DataType dtype, - const DDim& dims, - DataLayout layout, - const std::vector>& lod) - : dims(dims), dtype(dtype), layout(layout), lod(lod) {} - -inline bool DenseTensorMeta::valid() const noexcept { - bool valid{true}; - valid = valid && (dtype != DataType::UNDEFINED); - valid = valid && (layout != DataLayout::UNDEFINED); - valid = valid && (is_scalar || product(dims) >= 0); - return valid; -} - -inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { - bool ret = true; - return ret && (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) && - (lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && - (lhs.lod == rhs.lod) && (lhs.offset == rhs.offset); -} - -======= ->>>>>>> d08753df36986f5a5a7384f092c578c296e5150b } // namespace pten From 7b79b03e0196ad72420a54efddf8244c7dc4271a Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 17 Nov 2021 13:57:02 +0000 Subject: [PATCH 04/17] merge the code --- paddle/pten/core/tensor_meta.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/pten/core/tensor_meta.cc b/paddle/pten/core/tensor_meta.cc index ebdcd9b5f250b..6bc28908a2570 100644 --- a/paddle/pten/core/tensor_meta.cc +++ b/paddle/pten/core/tensor_meta.cc @@ -16,23 +16,23 @@ limitations under the License. 
*/ namespace pten { -DenseTensorMeta::DenseTensorMeta(DataType type, const DDim& dims) - : dims(dims), type(type) {} +DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims) + : dims(dims), dtype(dtype) {} -DenseTensorMeta::DenseTensorMeta(DataType type, +DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout) - : dims(dims), type(type), layout(layout) {} + : dims(dims), dtype(dtype), layout(layout) {} DenseTensorMeta::DenseTensorMeta(DataType type, const DDim& dims, DataLayout layout, const std::vector>& lod) - : dims(dims), type(type), layout(layout), lod(lod) {} + : dims(dims), dtype(dtype), layout(layout), lod(lod) {} bool DenseTensorMeta::valid() const noexcept { bool valid{true}; - valid = valid && (type != DataType::UNDEFINED); + valid = valid && (dtype != DataType::UNDEFINED); valid = valid && (layout != DataLayout::UNDEFINED); valid = valid && (is_scalar || product(dims) >= 0); return valid; @@ -41,7 +41,7 @@ bool DenseTensorMeta::valid() const noexcept { bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { bool ret = true; return ret && (lhs.is_scalar == rhs.is_scalar) && (lhs.dims == rhs.dims) && - (lhs.type == rhs.type) && (lhs.layout == rhs.layout) && + (lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && (lhs.lod == rhs.lod) && (lhs.offset == rhs.offset); } } // namespace pten From 471a1bf7de8a0874f5a2254859545c03758450cb Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 18 Nov 2021 02:48:40 +0000 Subject: [PATCH 05/17] fix the problem when merge conflict --- paddle/pten/core/tensor_meta.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pten/core/tensor_meta.cc b/paddle/pten/core/tensor_meta.cc index 6bc28908a2570..3e06508be69d6 100644 --- a/paddle/pten/core/tensor_meta.cc +++ b/paddle/pten/core/tensor_meta.cc @@ -24,7 +24,7 @@ DenseTensorMeta::DenseTensorMeta(DataType dtype, DataLayout layout) : dims(dims), dtype(dtype), layout(layout) {} -DenseTensorMeta::DenseTensorMeta(DataType type, +DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims, DataLayout layout, const std::vector>& lod) From 835e4156785eb799a84757f12c5687065cb24119 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 19 Nov 2021 07:22:00 +0000 Subject: [PATCH 06/17] fix bug of ci caused by type of tensor_meta --- paddle/pten/tests/kernels/test_elementwise_dev_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index c6e0d33991544..8dafce1fba7d8 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -118,7 +118,7 @@ TEST(DEV_API, subtract) { // 3. 
check result ASSERT_EQ(dense_out.dims().size(), 2); ASSERT_EQ(dense_out.dims()[0], 3); - ASSERT_EQ(dense_out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(dense_out.dtype(), pten::DataType::FLOAT32); ASSERT_EQ(dense_out.meta().layout, pten::DataLayout::NCHW); auto expect_result = sub; From 20ebc3f76e09358315f354d71a20bc5adeedfb46 Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Thu, 24 Mar 2022 11:34:56 +0000 Subject: [PATCH 07/17] update --- .../distributed/collective/CMakeLists.txt | 6 +- paddle/fluid/distributed/collective/Common.cc | 54 +++++ paddle/fluid/distributed/collective/Common.h | 33 ++++ .../fluid/distributed/collective/HCCLTools.cc | 39 ++++ .../fluid/distributed/collective/HCCLTools.h | 4 + .../fluid/distributed/collective/NCCLTools.cc | 46 +++++ .../fluid/distributed/collective/NCCLTools.h | 5 + .../collective/ProcessGroupHCCL.cc | 51 +---- .../collective/ProcessGroupHeter.cc | 185 ++++++++++++++++++ .../collective/ProcessGroupHeter.h | 106 ++++++++++ .../collective/ProcessGroupNCCL.cc | 56 +----- 11 files changed, 479 insertions(+), 106 deletions(-) create mode 100644 paddle/fluid/distributed/collective/Common.cc create mode 100644 paddle/fluid/distributed/collective/Common.h create mode 100644 paddle/fluid/distributed/collective/HCCLTools.cc create mode 100644 paddle/fluid/distributed/collective/NCCLTools.cc create mode 100644 paddle/fluid/distributed/collective/ProcessGroupHeter.cc create mode 100644 paddle/fluid/distributed/collective/ProcessGroupHeter.h diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 49ba9479d49e9..26c06f2ec20a3 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -6,8 +6,10 @@ if (WITH_DISTRIBUTE) endif() if(WITH_NCCL) - cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) + cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) + cc_library(processgroup_heter SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() if(WITH_ASCEND_CL) - cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) + cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) + cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() diff --git a/paddle/fluid/distributed/collective/Common.cc b/paddle/fluid/distributed/collective/Common.cc new file mode 100644 index 0000000000000..02eab58478ccc --- /dev/null +++ b/paddle/fluid/distributed/collective/Common.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/Common.h" + +namespace paddle { +namespace distributed { + +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +static bool CheckTensorsInPlace(const std::vector& tensors, + const PlaceType type) { + return std::all_of(tensors.cbegin(), tensors.cend(), + [&](const Tensor& t) { return t.place() == type; }); +} + +bool CheckTensorsInCudaPlace(const std::vector& tensors) { + return CheckTensorsInPlace(tensors, PlaceType::kGPU); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/Common.h b/paddle/fluid/distributed/collective/Common.h new file mode 100644 index 0000000000000..9569f4c61acef --- /dev/null +++ b/paddle/fluid/distributed/collective/Common.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +namespace paddle { +namespace distributed { + +using Tensor = paddle::experimental::Tensor; + +using Place = paddle::platform::Place; +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors); +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places); + +bool CheckTensorsInCudaPlace(const std::vector& tensors); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/HCCLTools.cc b/paddle/fluid/distributed/collective/HCCLTools.cc new file mode 100644 index 0000000000000..c9f69447340db --- /dev/null +++ b/paddle/fluid/distributed/collective/HCCLTools.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/HCCLTools.h" + +HcclReduceOp ToHCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, HCCL_REDUCE_MIN}, + {ReduceOp::MAX, HCCL_REDUCE_MAX}, + {ReduceOp::SUM, HCCL_REDUCE_SUM}, + {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ( + it != red_type.end(), true, + platform::errors::InvalidArgument("Invalid hccl reduction. " + "Must be Min | Max | Prod | Sum")); + return it->second; +} + +std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) { + const uint8_t* bytes = reinterpret_cast(&hcclID); + std::ostringstream oss; + for (size_t i = 0; i < sizeof(hcclID); ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h index 09789bd4d3786..a1dcf7cd9b626 100644 --- a/paddle/fluid/distributed/collective/HCCLTools.h +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -18,6 +18,7 @@ #include #include "boost/variant.hpp" +#include "paddle/fluid/distributed/collective/Types.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/collective_helper.h" @@ -170,5 +171,8 @@ class HCCLCommManager { mutable std::mutex mutex_; }; +HcclReduceOp ToHCCLRedType(ReduceOp reduction); +std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID); + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/NCCLTools.cc new file mode 100644 index 0000000000000..7e842ebf92166 --- /dev/null +++ b/paddle/fluid/distributed/collective/NCCLTools.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/Types.h" + +namespace paddle { +namespace distributed { + +ncclRedOp_t ToNCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, ncclMin}, + {ReduceOp::MAX, ncclMax}, + {ReduceOp::SUM, ncclSum}, + {ReduceOp::PRODUCT, ncclProd}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ(it != red_type.end(), true, + platform::errors::InvalidArgument( + "Invalid nccl reduction. 
Must be ncclMin | ncclMax | " + "ncclProd | ncclSum")); + return it->second; +} + +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { + const uint8_t* bytes = reinterpret_cast(&ncclID); + std::ostringstream oss; + for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index f30b96e72d453..0454518b1836c 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -26,6 +26,8 @@ #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/distributed/collective/Types.h" + namespace paddle { namespace distributed { @@ -194,5 +196,8 @@ class NCCLCommManager { mutable std::mutex mutex_; }; +ncclRedOp_t ToNCCLRedType(ReduceOp reduction); +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID); + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 2deeb7ca03003..402f45707d6b1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" +#include "paddle/fluid/distributed/collective/Common.h" +#include "paddle/fluid/distributed/collective/HCCLTools.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/npu/hccl_helper.h" #include "paddle/fluid/platform/device_context.h" @@ -28,55 +30,6 @@ constexpr int64_t kWaitBlockTImeout = 10; namespace paddle { namespace distributed { -static HcclReduceOp ToHCCLRedType(ReduceOp reduction) { - static const std::map red_type = { - {ReduceOp::MIN, HCCL_REDUCE_MIN}, - {ReduceOp::MAX, HCCL_REDUCE_MAX}, - {ReduceOp::SUM, HCCL_REDUCE_SUM}, - {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, - }; - auto it = red_type.find(reduction); - PADDLE_ENFORCE_EQ( - it != red_type.end(), true, - platform::errors::InvalidArgument("Invalid hccl reduction. 
" - "Must be Min | Max | Prod | Sum")); - return it->second; -} - -std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) { - const uint8_t* bytes = reinterpret_cast(&hcclID); - std::ostringstream oss; - for (size_t i = 0; i < sizeof(hcclID); ++i) { - oss << std::hex << static_cast(bytes[i]); - } - return oss.str(); -} - -// Get the list of devices from list of tensors -std::vector GetPlaceList(const std::vector& tensors) { - std::vector places; - places.reserve(tensors.size()); - for (auto& tensor : tensors) { - places.push_back(tensor.inner_place()); - } - return places; -} - -// Get the deviceList String from the list of devices -std::string GetKeyFromPlaces(const std::vector& places) { - std::string placeList; - for (auto& place : places) { - std::stringstream tmp; - tmp << place; - if (placeList.empty()) { - placeList += tmp.str(); - } else { - placeList += "," + tmp.str(); - } - } - return placeList; -} - // bool CheckTensorsInNPUPlace(const std::vector& tensors) { // return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { // return t.place() == platform::DeviceType::NPU; diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc new file mode 100644 index 0000000000000..42ea0d46cd6da --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/collective/ProcessGroupHeter.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; + +std::shared_ptr ProcessGroupHeter::CreateTask( + int rank, CommType comm_type, const std::vector& inputs) { + return std::make_shared(rank, comm_type, + inputs); +} + +ProcessGroupHeter::HeterTask::HeterTask(int rank, CommType CommType, + const std::vector& inputs) + : Task(rank, inputs, CommType) {} + +ProcessGroupHeter::HeterTask::~HeterTask() {} + +bool ProcessGroupHeter::HeterTask::IsCompleted() { return true; } + +// TODO(sheniang03): Add timeout for wait, now timeout unused +bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) { + return true; +} + +ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, + int rank, int size, int local_rank, + int local_size, bool with_switch, + int gloo_rank, int gloo_size) + : ProcessGroup(rank, size), + store_(store), + local_rank_(local_rank), + local_size_(local_size), + with_switch_(with_switch), + gloo_rank_(gloo_rank), + gloo_size_(gloo_size) { +#if defined(PADDLE_WITH_NCCL) + inner_pg_ = std::make_shared(store, local_rank, local_size); +#elif defined(PADDLE_WITH_ASCEND_CL) + inner_pg_ = std::make_shared(store, local_rank, local_size); +#else + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroupHeter only supports NCCL and HCCL now."); +#endif + if (with_switch_) { + // TODO(sandyhouse) starts a client to connect the cloud switch module + } else if (local_rank_ == 0) { + auto opts = ProcessGroupGloo::GlooOptions::create(); + opts->device = ProcessGroupGloo::createDefaultDevice(); + inter_pg_ = + std::make_shared(store, gloo_rank_, gloo_size_, opts); + } +} + +std::shared_ptr ProcessGroupHeter::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { +#if defined(PADDLE_WITH_NCCL) + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); +#elif defined(PADDLE_WITH_ASCEND_CL) + PADDLE_ENFORCE_EQ( + CheckTensorsInNPUPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in NPUPlace.")); +#endif + + // Step1: do allreduce in inner cluster + auto task = inner_pg_->AllReduce(tensors, opts); + task->Wait(); + + // Step2: copy tensors to CPU + if (local_rank_ == 0) { + std::vector cpu_tensors(tensors.size()); + for (size_t i = 0; i < tensors.size(); i++) { + auto dense_gpu_tensor = + std::dynamic_pointer_cast(tensors[i].impl()); + auto dense_cpu_tensor = + std::dynamic_pointer_cast(cpu_tensors[i].impl()); + dense_cpu_tensor->Resize(tensors[i].dims()); + framework::TensorCopySync(*dense_gpu_tensor, platform::CPUPlace(), + dense_cpu_tensor.get()); + } + // Step3: do inter cluster allreduce + if (with_switch_) { + // TODO(sandyhouse) send to and recv from switch, and do add + } else { + auto gloo_task = inter_pg_->AllReduce(cpu_tensors, opts); + gloo_task->Wait(); + } + // Step4: copy cpu tensors to gpu + // TODO(sandyhouse) + // copy cpu tensors to gpu + for (size_t i = 0; i < tensors.size(); i++) { + auto dense_gpu_tensor = + std::dynamic_pointer_cast(tensors[i].impl()); + auto dense_cpu_tensor = + std::dynamic_pointer_cast(cpu_tensors[i].impl()); + // framework::TensorCopySync(*dense_cpu_tensor, 
tensors[i].place(), + // dense_gpu_tensor.get()); + framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), + dense_gpu_tensor.get()); + } + } + + // Step5: broadcast among inner cluster + auto b_opts = BroadcastOptions(); + b_opts.source_root = 0; + auto broadcast_task = inner_pg_->Broadcast(tensors, b_opts); + broadcast_task->Wait(); + return CreateTask(rank_, CommType::ALLREDUCE, tensors); +} + +std::shared_ptr ProcessGroupHeter::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { +#if defined(PADDLE_WITH_NCCL) + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); +#elif defined(PADDLE_WITH_ASCEND_CL) + PADDLE_ENFORCE_EQ( + CheckTensorsInNPUPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in NPUPlace.")); +#endif + + // Step1: do broadcast in inner cluster + auto b_opts = BroadcastOptions(); + b_opts.source_root = 0; + inner_pg_->Broadcast(tensors, b_opts); + + if (local_rank_ == 0) { + std::vector cpu_tensors(tensors.size()); + for (size_t i = 0; i < tensors.size(); i++) { + auto dense_gpu_tensor = + std::dynamic_pointer_cast(tensors[i].impl()); + auto dense_cpu_tensor = + std::dynamic_pointer_cast(cpu_tensors[i].impl()); + dense_cpu_tensor->Resize(tensors[i].dims()); + framework::TensorCopySync(*dense_gpu_tensor, platform::CPUPlace(), + dense_cpu_tensor.get()); + } + if (with_switch_) { + // TODO(sandyhouse) send to and recv + } else { + auto gloo_task = inter_pg_->Broadcast(cpu_tensors, opts); + gloo_task->Wait(); + } + for (size_t i = 0; i < tensors.size(); i++) { + auto dense_gpu_tensor = + std::dynamic_pointer_cast(tensors[i].impl()); + auto dense_cpu_tensor = + std::dynamic_pointer_cast(cpu_tensors[i].impl()); + // framework::TensorCopySync(*dense_cpu_tensor, tensors[i].place(), + // dense_gpu_tensor.get()); + framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), + dense_gpu_tensor.get()); + } + } + auto broadcast_task = inner_pg_->Broadcast(tensors, b_opts); + broadcast_task->Wait(); + return CreateTask(rank_, CommType::BROADCAST, tensors); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h new file mode 100644 index 0000000000000..95b6961e23f21 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#endif + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" +#endif + +#include "paddle/fluid/distributed/collective/Common.h" + +constexpr const char* HETER_BACKEND_NAME = "HETER_BACKEND"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; + +class ProcessGroupHeter : public ProcessGroup { + public: + class HeterTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + HeterTask(int rank, CommType CommType, const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams() {} + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize() {} + + virtual ~HeterTask(); + }; + + ProcessGroupHeter(const std::shared_ptr& store, int rank, int size, + int local_rank, int local_size, bool with_switch, + int gloo_rank, int gloo_size); + + const std::string GetBackendName() const override { + return std::string(HETER_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + protected: + virtual std::shared_ptr CreateTask( + int rank, CommType opType, const std::vector& inputs); + + private: + std::shared_ptr store_; + std::shared_ptr inner_pg_; + std::shared_ptr inter_pg_; + + int local_rank_; + int local_size_; + bool with_switch_; + int gloo_rank_; + int gloo_size_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 2af407f711ec1..0aa336d5830f3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/include/api.h" @@ -26,61 +27,6 @@ constexpr int64_t kWaitBlockTImeout = 10; namespace paddle { namespace distributed { -static ncclRedOp_t ToNCCLRedType(ReduceOp reduction) { - static const std::map red_type = { - {ReduceOp::MIN, ncclMin}, - {ReduceOp::MAX, ncclMax}, - {ReduceOp::SUM, ncclSum}, - {ReduceOp::PRODUCT, ncclProd}, - }; - auto it = red_type.find(reduction); - PADDLE_ENFORCE_EQ(it != red_type.end(), true, - platform::errors::InvalidArgument( - "Invalid nccl reduction. 
Must be ncclMin | ncclMax | " - "ncclProd | ncclSum")); - return it->second; -} - -std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { - const uint8_t* bytes = reinterpret_cast(&ncclID); - std::ostringstream oss; - for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) { - oss << std::hex << static_cast(bytes[i]); - } - return oss.str(); -} - -// Get the list of devices from list of tensors -std::vector GetPlaceList(const std::vector& tensors) { - std::vector places; - places.reserve(tensors.size()); - for (auto& tensor : tensors) { - places.push_back(tensor.inner_place()); - } - return places; -} - -// Get the deviceList String from the list of devices -std::string GetKeyFromPlaces(const std::vector& places) { - std::string placeList; - for (auto& place : places) { - std::stringstream tmp; - tmp << place; - if (placeList.empty()) { - placeList += tmp.str(); - } else { - placeList += "," + tmp.str(); - } - } - return placeList; -} - -bool CheckTensorsInCudaPlace(const std::vector& tensors) { - return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { - return t.place() == PlaceType::kGPU; - }); -} - void SyncDefaultStream( const std::vector& places, std::vector& ncclEvents, // NOLINT From 829adae97160a2ed0846357fd23a834a04805d18 Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Mon, 28 Mar 2022 01:49:08 +0000 Subject: [PATCH 08/17] update --- .../distributed/collective/ProcessGroup.cc | 8 +++- .../distributed/collective/ProcessGroup.h | 44 +++++++++++++++---- .../collective/ProcessGroupGloo.cc | 6 ++- .../distributed/collective/ProcessGroupGloo.h | 2 +- .../collective/ProcessGroupHCCL.cc | 4 +- .../distributed/collective/ProcessGroupHCCL.h | 3 +- .../collective/ProcessGroupHeter.cc | 26 ++++++----- .../collective/ProcessGroupHeter.h | 8 ++-- .../collective/ProcessGroupNCCL.cc | 4 +- .../distributed/collective/ProcessGroupNCCL.h | 3 +- paddle/fluid/pybind/distributed_py.cc | 14 +++--- 11 files changed, 84 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc index 42ca3bd5f5be4..ab118dadd5d88 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.cc +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -34,7 +34,13 @@ bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) { void ProcessGroup::Task::Synchronize() {} -ProcessGroup::ProcessGroup(int rank, int size) : rank_(rank), size_(size) {} +ProcessGroup::ProcessGroup(int rank, int size, int gid) + : rank_(rank), size_(size) { + if (gid != IGNORE_ID) { + auto map = ProcessGroupMapFromGid::getInstance(); + map->insert(gid, this); + } +} } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index e43d0e8c183c7..d295d65f9b911 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -31,6 +31,7 @@ constexpr auto kWaitTimeout = std::chrono::milliseconds(0); namespace paddle { namespace distributed { +constexpr int IGNORE_ID = -1; using Tensor = paddle::experimental::Tensor; enum class CommType : std::uint8_t { @@ -49,14 +50,6 @@ enum class CommType : std::uint8_t { UNKNOWN = 100, }; -struct ProcessGroupStrategy { - int nranks_{1}; - int local_rank_{0}; - std::vector trainer_endpoints_{}; - std::string current_endpoint_{""}; - int nrings_{1}; -}; - class ProcessGroup { public: class Task { @@ -76,7 +69,7 @@ class ProcessGroup { 
    bool is_completed_ = false;
  };
-  explicit ProcessGroup(int rank, int size);
+  explicit ProcessGroup(int rank, int size, int gid);
   virtual ~ProcessGroup() {}
   int GetRank() const { return rank_; }
@@ -151,5 +144,38 @@ class ProcessGroup {
   const int size_;
 };
+class ProcessGroupMapFromGid {
+ public:
+  bool has(int gid) {
+    auto it = map_.find(gid);
+    return it != map_.end();
+  }
+
+  void insert(int gid, ProcessGroup* pg) {
+    PADDLE_ENFORCE_EQ(has(gid), false,
+                      platform::errors::PreconditionNotMet(
+                          "The process group with id %d doesnot exist.", gid));
+    map_[gid] = pg;
+  }
+
+  ProcessGroup* get(int gid) {
+    PADDLE_ENFORCE_EQ(has(gid), false,
+                      platform::errors::PreconditionNotMet(
+                          "The process group with id %d doesnot exist.", gid));
+    return map_.find(gid)->second;
+  }
+
+  static std::shared_ptr getInstance() {
+    static auto s_instance = std::make_shared();
+    return s_instance;
+  }
+
+  ProcessGroupMapFromGid() = default;
+  ~ProcessGroupMapFromGid() = default;
+
+ private:
+  std::unordered_map map_;
+};
+
 } // namespace distributed
 } // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
index cb82677a281e9..91c3bf93849e0 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc
@@ -173,8 +173,10 @@ ProcessGroupGloo::GlooTask::GlooTask(int rank,
 ProcessGroupGloo::ProcessGroupGloo(
     const std::shared_ptr& store, int rank,
-    int world_size, const std::shared_ptr options)
-    : ProcessGroup(rank, world_size), _tag(0), _store(new GlooStore(store)) {
+    int world_size, int gid, const std::shared_ptr options)
+    : ProcessGroup(rank, world_size, gid),
+      _tag(0),
+      _store(new GlooStore(store)) {
   _context = std::make_shared(rank, world_size);
   auto prefix_store =
       ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store);
diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
index 71e0a40f8a761..f0bf872cfc9e4 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h
@@ -101,7 +101,7 @@ class ProcessGroupGloo : public ProcessGroup {
   explicit ProcessGroupGloo(
       const std::shared_ptr& store, int rank,
-      int world_size, std::shared_ptr options);
+      int world_size, int gid, std::shared_ptr options);
   ~ProcessGroupGloo() = default;
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
index 402f45707d6b1..b21155e09d06e 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
@@ -103,8 +103,8 @@ bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
 void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
 ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr& store,
-                                   int rank, int size)
-    : ProcessGroup(rank, size), store_(store) {}
+                                   int rank, int size, int gid)
+    : ProcessGroup(rank, size, gid), store_(store) {}
 void ProcessGroupHCCL::BroadcastUniqueHCCLID(
     std::vector& hccl_ids) {  // NOLINT
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
index 83d509be2cdd7..932ae75fc6b9d 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
@@ -70,7 +70,8 @@ class ProcessGroupHCCL : public 
ProcessGroup { private: }; - ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size); + ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size, + int gid); const std::string GetBackendName() const override { return std::string(HCCL_BACKEND_NAME); diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index 42ea0d46cd6da..3854efc1e6aee 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -45,31 +45,37 @@ bool ProcessGroupHeter::HeterTask::Wait(std::chrono::milliseconds timeout) { } ProcessGroupHeter::ProcessGroupHeter(const std::shared_ptr& store, - int rank, int size, int local_rank, - int local_size, bool with_switch, - int gloo_rank, int gloo_size) - : ProcessGroup(rank, size), + int rank, int size, int gid, + int local_rank, int local_size, + int gloo_rank, int gloo_size, + bool with_switch, + std::string switch_endpoint) + : ProcessGroup(rank, size, gid), store_(store), local_rank_(local_rank), local_size_(local_size), - with_switch_(with_switch), gloo_rank_(gloo_rank), - gloo_size_(gloo_size) { + gloo_size_(gloo_size), + with_switch_(with_switch) { #if defined(PADDLE_WITH_NCCL) - inner_pg_ = std::make_shared(store, local_rank, local_size); + inner_pg_ = std::make_shared(store, local_rank, local_size, + IGNORE_ID); #elif defined(PADDLE_WITH_ASCEND_CL) - inner_pg_ = std::make_shared(store, local_rank, local_size); + inner_pg_ = std::make_shared(store, local_rank, local_size, + IGNORE_ID); #else PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroupHeter only supports NCCL and HCCL now."); #endif if (with_switch_) { // TODO(sandyhouse) starts a client to connect the cloud switch module + // std::shared_ptr client_ = + // HeterClient::GetInstance({switch_endpoint}, {}, 0); } else if (local_rank_ == 0) { auto opts = ProcessGroupGloo::GlooOptions::create(); opts->device = ProcessGroupGloo::createDefaultDevice(); - inter_pg_ = - std::make_shared(store, gloo_rank_, gloo_size_, opts); + inter_pg_ = std::make_shared(store, gloo_rank_, + gloo_size_, IGNORE_ID, opts); } } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index 95b6961e23f21..7a7ebaf604b57 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -23,6 +23,7 @@ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" @@ -71,8 +72,9 @@ class ProcessGroupHeter : public ProcessGroup { }; ProcessGroupHeter(const std::shared_ptr& store, int rank, int size, - int local_rank, int local_size, bool with_switch, - int gloo_rank, int gloo_size); + int gid, int local_rank, int local_size, int gloo_rank, + int gloo_size, bool with_switch, + std::string switch_endpoints); const std::string GetBackendName() const override { return std::string(HETER_BACKEND_NAME); @@ -97,9 +99,9 @@ class ProcessGroupHeter : public ProcessGroup { int local_rank_; int local_size_; - bool with_switch_; int gloo_rank_; int gloo_size_; + bool with_switch_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc 
b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 0aa336d5830f3..7c0752b5f367c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -103,8 +103,8 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, - int rank, int size) - : ProcessGroup(rank, size), store_(store) {} + int rank, int size, int gid) + : ProcessGroup(rank, size, gid), store_(store) {} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index aa2a2b8fa2088..4ab5374dacaf4 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -76,7 +76,8 @@ class ProcessGroupNCCL : public ProcessGroup { private: }; - ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size); + ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size, + int gid); const std::string GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index e89d8d96342e7..9f71001793da3 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -213,7 +213,8 @@ void BindDistributed(py::module *m) { py::class_>( *m, "ProcessGroupNCCL", ProcessGroup) - .def(py::init &, int, int>(), + .def(py::init &, int, int, + int>(), py::call_guard()); #endif @@ -221,7 +222,8 @@ void BindDistributed(py::module *m) { py::class_>( *m, "ProcessGroupHCCL", ProcessGroup) - .def(py::init &, int, int>(), + .def(py::init &, int, int, + int>(), py::call_guard()); #endif @@ -238,10 +240,10 @@ void BindDistributed(py::module *m) { py::class_>( *m, "ProcessGroupGloo", ProcessGroup) .def(py::init &, int, - int, std::shared_ptr &>(), + int, int, std::shared_ptr &>(), py::call_guard()) .def(py::init([](const std::shared_ptr &store, - int rank, int world_size) { + int rank, int world_size, int gid) { auto opts = GlooOptions::create(); char *ifname = getenv(GLOO_SOCKET_IFNAME_ENV.c_str()); if (ifname && strlen(ifname) > 1) { @@ -251,10 +253,10 @@ void BindDistributed(py::module *m) { opts->device = ProcessGroupGloo::createDefaultDevice(); } return std::make_shared(store, rank, world_size, - opts); + gid, opts); }), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::call_guard()) + py::arg("group_id"), py::call_guard()) .def_static("create_default_device", &ProcessGroupGloo::createDefaultDevice); #endif From 67d84d9a5d610a548ba017fbfeac6435b5664687 Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Mon, 28 Mar 2022 06:59:51 +0000 Subject: [PATCH 09/17] update --- .../distributed/collective/ProcessGroup.h | 6 +++++ .../collective/ProcessGroupHeter.cc | 27 +++++++++++++++++++ .../collective/ProcessGroupHeter.h | 2 ++ .../operators/collective/c_broadcast_op.cu.cc | 9 +++++++ 4 files changed, 44 insertions(+) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index d295d65f9b911..4e9eb7b664c41 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -92,6 +92,12 @@ class ProcessGroup { "ProcessGroup%s does not support broadcast", 
GetBackendName()));
   }
+
+  virtual void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support broadcast for static",
+        GetBackendName()));
+  }
+
   virtual std::shared_ptr Barrier(
       const BarrierOptions& = BarrierOptions()) {
     PADDLE_THROW(platform::errors::InvalidArgument(
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc
index 3854efc1e6aee..e61c4c2f6410f 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc
@@ -187,5 +187,32 @@ std::shared_ptr ProcessGroupHeter::Broadcast(
   return CreateTask(rank_, CommType::BROADCAST, tensors);
 }
+void ProcessGroupHeter::Broadcast(const phi::DenseTensor* in,
+                                  phi::DenseTensor* out) {
+  // Step1: do broadcast in inner cluster
+  auto b_opts = BroadcastOptions();
+  b_opts.source_root = 0;
+  inner_pg_->Broadcast(in, out);
+
+  if (local_rank_ == 0) {
+    Tensor cpu_tensor;
+    auto dense_cpu_tensor =
+        std::dynamic_pointer_cast(cpu_tensor.impl());
+    dense_cpu_tensor->Resize(in->dims());
+    framework::TensorCopySync(*in, platform::CPUPlace(),
+                              dense_cpu_tensor.get());
+    if (with_switch_) {
+      // TODO(sandyhouse) send to and recv
+    } else {
+      auto gloo_task = inter_pg_->Broadcast(cpu_tensors, opts);
+      gloo_task->Wait();
+    }
+    framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(),
+                              out);
+  }
+  auto broadcast_task = inner_pg_->Broadcast(out, out);
+  broadcast_task->Wait();
+}
+
 } // namespace distributed
 } // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h
index 7a7ebaf604b57..e8290a0c87917 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h
@@ -88,6 +88,8 @@ class ProcessGroupHeter : public ProcessGroup {
       std::vector& tensors,
       const BroadcastOptions& = BroadcastOptions()) override;
+  void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) override;
+
  protected:
   virtual std::shared_ptr CreateTask(
       int rank, CommType opType, const std::vector& inputs);
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
index b16f256ee6cf3..0ad61bb16b51e 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc
@@ -18,6 +18,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -36,6 +38,13 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + pg->Broadcast(x, out); + return; + } gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { From 8922319c774377e2d5f1a554da7e15b39c09145e Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Mon, 28 Mar 2022 18:03:06 +0800 Subject: [PATCH 10/17] update --- .../fluid/distributed/collective/HCCLTools.cc | 7 +++++++ .../distributed/collective/ProcessGroup.h | 2 +- .../collective/ProcessGroupHeter.cc | 19 +++++-------------- .../collective/ProcessGroupHeter.h | 4 ++-- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/distributed/collective/HCCLTools.cc b/paddle/fluid/distributed/collective/HCCLTools.cc index c9f69447340db..526a683e057c0 100644 --- a/paddle/fluid/distributed/collective/HCCLTools.cc +++ b/paddle/fluid/distributed/collective/HCCLTools.cc @@ -13,6 +13,10 @@ // limitations under the License. #include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/collective/Types.h" + +namespace paddle { +namespace distributed { HcclReduceOp ToHCCLRedType(ReduceOp reduction) { static const std::map red_type = { @@ -37,3 +41,6 @@ std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) { } return oss.str(); } + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 4e9eb7b664c41..36a00a7d31758 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -92,7 +92,7 @@ class ProcessGroup { "ProcessGroup%s does not support broadcast", GetBackendName())); } - void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) { + virtual void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support broadcast for static", GetBackendName())); diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index e61c4c2f6410f..ffd653042494d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -85,10 +85,6 @@ std::shared_ptr ProcessGroupHeter::AllReduce( PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); -#elif defined(PADDLE_WITH_ASCEND_CL) - PADDLE_ENFORCE_EQ( - CheckTensorsInNPUPlace(tensors), true, - platform::errors::InvalidArgument("All inputs should be in NPUPlace.")); #endif // Step1: do allreduce in inner cluster @@ -143,10 +139,6 @@ std::shared_ptr ProcessGroupHeter::Broadcast( PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); -#elif defined(PADDLE_WITH_ASCEND_CL) - PADDLE_ENFORCE_EQ( - CheckTensorsInNPUPlace(tensors), true, - 
platform::errors::InvalidArgument("All inputs should be in NPUPlace.")); #endif // Step1: do broadcast in inner cluster @@ -190,8 +182,6 @@ std::shared_ptr ProcessGroupHeter::Broadcast( void ProcessGroupHeter::Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) { // Step1: do broadcast in inner cluster - auto b_opts = BroadcastOptions(); - b_opts.source_root = 0; inner_pg_->Broadcast(in, out); if (local_rank_ == 0) { @@ -204,14 +194,15 @@ void ProcessGroupHeter::Broadcast(const phi::DenseTensor* in, if (with_switch_) { // TODO(sandyhouse) send to and recv } else { - auto gloo_task = inter_pg_->Broadcast(cpu_tensors, opts); - gloo_task->Wait(); + std::vector cpu_tensors = {cpu_tensor}; + // auto gloo_task = inter_pg_->Broadcast(cpu_tensors); + // gloo_task->Wait(); + inter_pg_->Broadcast(cpu_tensors); } framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), out); } - auto broadcast_task = inner_pg_->Broadcast(out, out); - broadcast_task->Wait(); + inner_pg_->Broadcast(out, out); } } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index e8290a0c87917..9b4cf3f2056da 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -23,8 +23,7 @@ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" -#include "paddle/fluid/distributed/ps/service/heter_client.h" -#include "paddle/fluid/platform/cuda_device_guard.h" +// #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/distributed/store/store.h" @@ -36,6 +35,7 @@ #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/distributed/collective/NCCLTools.h" #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/dynload/nccl.h" #endif From e73ca14acc8aca6178324926b1f204440c80b902 Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Mon, 28 Mar 2022 18:06:54 +0800 Subject: [PATCH 11/17] update --- paddle/fluid/distributed/collective/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 26c06f2ec20a3..79f3e7f1e86db 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -11,5 +11,5 @@ if(WITH_NCCL) endif() if(WITH_ASCEND_CL) cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) - cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) + cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) endif() From d0179a53514ffbe84796c91411930678e175d6cc Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Mon, 28 Mar 2022 18:56:26 +0800 Subject: [PATCH 12/17] update --- paddle/fluid/pybind/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 7b223f7ed27e2..69a928fc50d3d 100644 --- 
a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -91,12 +91,14 @@ if(NOT ON_INFER) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if (WITH_NCCL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) endif() if (WITH_GLOO) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) endif() if(WITH_ASCEND_CL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) endif() From f4bc987157948be7836c894d0ce32020b46c63bf Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Tue, 29 Mar 2022 03:07:26 +0000 Subject: [PATCH 13/17] update --- paddle/fluid/distributed/collective/CMakeLists.txt | 8 ++++++-- paddle/fluid/distributed/collective/ProcessGroupHeter.h | 4 ++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 26c06f2ec20a3..d2002bb09e139 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -7,9 +7,13 @@ endif() if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) - cc_library(processgroup_heter SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) + if (WITH_DISTRIBUTE) + cc_library(processgroup_heter SRCS ProcessGroupHeter.cc NCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) + endif() endif() if(WITH_ASCEND_CL) cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) - cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) + if (WITH_DISTRIBUTE) + cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) + endif() endif() diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index e8290a0c87917..f813ab587b9fd 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -27,6 +27,10 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_GLOO +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + #include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" From 83c6fef6dab2a61334ed68dc2c48fa22dc30fa42 Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Tue, 29 Mar 2022 03:17:51 +0000 Subject: [PATCH 14/17] update --- paddle/fluid/pybind/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 69a928fc50d3d..7b223f7ed27e2 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -91,14 +91,12 @@ if(NOT ON_INFER) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if (WITH_NCCL) set (PYBIND_DEPS 
${PYBIND_DEPS} processgroup_nccl) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) endif() if (WITH_GLOO) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo) endif() if(WITH_ASCEND_CL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_heter) endif() set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) endif() From 0df9a69009800f3c39359e27e0bbb3f187ed5417 Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Tue, 29 Mar 2022 03:30:39 +0000 Subject: [PATCH 15/17] update --- paddle/fluid/distributed/collective/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index a2059a11c3afb..6fb805a72e4de 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -15,6 +15,6 @@ endif() if(WITH_ASCEND_CL) cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) if (WITH_DISTRIBUTE) - cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) + cc_library(processgroup_heter SRCS ProcessGroupHeter.cc HCCLTools.cc Common.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) endif() endif() From 4f0ff5d9431d52c9bd13bce07e65fd1244d56e44 Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Tue, 29 Mar 2022 05:01:57 +0000 Subject: [PATCH 16/17] update --- paddle/fluid/pybind/distributed_py.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 9f71001793da3..d74b11c069fbb 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -256,7 +256,7 @@ void BindDistributed(py::module *m) { gid, opts); }), py::arg("store"), py::arg("rank"), py::arg("world_size"), - py::arg("group_id"), py::call_guard()) + py::arg("group_id") = 0, py::call_guard()) .def_static("create_default_device", &ProcessGroupGloo::createDefaultDevice); #endif From 3ef390b5499747d20cdee3bdc00e0f74100d6b4d Mon Sep 17 00:00:00 2001 From: sandyhouse Date: Wed, 30 Mar 2022 04:31:47 +0000 Subject: [PATCH 17/17] update --- paddle/fluid/pybind/distributed_py.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index d74b11c069fbb..6c74ea2eef4d0 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -215,7 +215,8 @@ void BindDistributed(py::module *m) { *m, "ProcessGroupNCCL", ProcessGroup) .def(py::init &, int, int, int>(), - py::call_guard()); + py::arg("store"), py::arg("rank"), py::arg("world_size"), + py::arg("group_id") = 0, py::call_guard()); #endif #if defined(PADDLE_WITH_ASCEND_CL) @@ -224,7 +225,8 @@ void BindDistributed(py::module *m) { *m, "ProcessGroupHCCL", ProcessGroup) .def(py::init &, int, int, int>(), - py::call_guard()); + py::arg("store"), py::arg("rank"), py::arg("world_size"), + py::arg("group_id") = 0, py::call_guard()); #endif py::class_