From 0efcae8676869d923eb3beca5259549e8b0776a0 Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 12 Jan 2022 21:09:38 +0800 Subject: [PATCH 01/24] [part 3]change type of function args (#38887) * code clean * [part 3]change type of function args --- .../fluid/operators/controlflow/bitwise_op.h | 30 ++++++------- .../operators/controlflow/compare_all_op.h | 2 +- .../fluid/operators/controlflow/compare_op.h | 12 +++--- .../fluid/operators/controlflow/logical_op.cu | 28 ++----------- .../fluid/operators/controlflow/logical_op.h | 42 ++++++++----------- 5 files changed, 44 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h index 92abe4cd3b1c3..9e652f9200747 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.h +++ b/paddle/fluid/operators/controlflow/bitwise_op.h @@ -22,19 +22,19 @@ limitations under the License. */ namespace paddle { namespace operators { -#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ - template \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = T; \ - HOSTDEVICE T operator()(const T& a, const T& b) const { return a expr b; } \ - }; \ - \ - template <> \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = bool; \ - HOSTDEVICE bool operator()(const bool& a, const bool& b) const { \ - return a bool_expr b; \ - } \ +#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ + template \ + struct Bitwise##func##Functor { \ + using ELEM_TYPE = T; \ + HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \ + }; \ + \ + template <> \ + struct Bitwise##func##Functor { \ + using ELEM_TYPE = bool; \ + HOSTDEVICE bool operator()(const bool a, const bool b) const { \ + return a bool_expr b; \ + } \ }; BITWISE_BINARY_FUNCTOR(And, &, &&) @@ -45,13 +45,13 @@ BITWISE_BINARY_FUNCTOR(Xor, ^, !=) template struct BitwiseNotFunctor { using ELEM_TYPE = T; - HOSTDEVICE T operator()(const T& a) const { return ~a; } + HOSTDEVICE T operator()(const T a) const { return ~a; } }; template <> struct BitwiseNotFunctor { using ELEM_TYPE = bool; - HOSTDEVICE bool operator()(const bool& a) const { return !a; } + HOSTDEVICE bool operator()(const bool a) const { return !a; } }; template diff --git a/paddle/fluid/operators/controlflow/compare_all_op.h b/paddle/fluid/operators/controlflow/compare_all_op.h index bcad240601cf6..78a7b76e3fd9d 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.h +++ b/paddle/fluid/operators/controlflow/compare_all_op.h @@ -28,7 +28,7 @@ namespace operators { template struct EqualReduceFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { if (std::is_floating_point::value) { // This branch will be optimized while compiling if T is integer. It is // safe to cast a and b to double. 
diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index 36185322a96b8..d2ef4c9befba9 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -25,31 +25,31 @@ namespace operators { template struct LessThanFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a < b; } }; template struct LessEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a <= b; } }; template struct GreaterThanFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a > b; } }; template struct GreaterEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; } + HOSTDEVICE bool operator()(const T a, const T b) const { return a >= b; } }; template struct EqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { if (std::is_floating_point::value) { // This branch will be optimized while compiling if T is integer. It is // safe to cast a and b to double. @@ -63,7 +63,7 @@ struct EqualFunctor { template struct NotEqualFunctor { using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { + HOSTDEVICE bool operator()(const T a, const T b) const { return !EqualFunctor()(a, b); } }; diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 301b4c4149fad..4a3fc6c895174 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -18,26 +18,6 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T* args) const { \ - return static_cast(args[0]) op static_cast(args[1]); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(CudaOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(CudaAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(CudaXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct CudaNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T* args) const { return !args[0]; } -}; - template class BinaryLogicalOpKernel : public framework::OpKernel { @@ -76,8 +56,8 @@ class BinaryLogicalOpKernel ops::BinaryLogicalOpKernel>, \ ops::BinaryLogicalOpKernel>); -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, CudaOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, CudaAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, CudaXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, CudaNotFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) #undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h index 92fe0a10cb907..ee63da60fcd0f 100644 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ 
b/paddle/fluid/operators/controlflow/logical_op.h @@ -19,38 +19,32 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct LogicalAndFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; } -}; +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T a, const T b) const { \ + return static_cast(a) op static_cast(b); \ + } \ + }; -template -struct LogicalOrFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; } -}; +LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR template struct LogicalNotFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a) const { return !a; } -}; - -template -struct LogicalXorFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T& a, const T& b) const { - return (a || b) && !(a && b); - } + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T a) const { return !a; } }; template class BinaryLogicalOpKernel - : public framework::OpKernel { + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; + using T = typename Functor::ELEMENT_TYPE; auto* x = context.Input("X"); auto* y = context.Input("Y"); auto* out = context.Output("Out"); @@ -62,10 +56,10 @@ class BinaryLogicalOpKernel template class UnaryLogicalOpKernel - : public framework::OpKernel { + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; + using T = typename Functor::ELEMENT_TYPE; auto* x = context.Input("X"); auto* out = context.Output("Out"); Functor unary_func; From 277cf900fb49a28e7d7818addbb863f2b62d3ef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Thu, 13 Jan 2022 10:23:12 +0800 Subject: [PATCH 02/24] splits allocation for pten, test=develop (#38853) --- paddle/fluid/framework/operator.h | 4 +- paddle/fluid/framework/tensor.cc | 8 -- paddle/fluid/framework/tensor.h | 8 -- paddle/fluid/framework/tensor_util.cc | 3 +- paddle/fluid/framework/tensor_util.h | 9 +- .../inference/api/details/zero_copy_tensor.cc | 7 +- paddle/fluid/inference/lite/tensor_utils.cc | 5 +- .../memory/allocation/aligned_allocator.cc | 14 ++- .../memory/allocation/aligned_allocator.h | 4 +- paddle/fluid/memory/allocation/allocator.cc | 9 +- paddle/fluid/memory/allocation/allocator.h | 105 ++++++----------- .../memory/allocation/allocator_facade.cc | 34 +++--- .../memory/allocation/allocator_facade.h | 1 + .../auto_growth_best_fit_allocator.cc | 11 +- .../auto_growth_best_fit_allocator.h | 8 +- .../auto_growth_best_fit_allocator_test.cc | 8 +- .../fluid/memory/allocation/base_ptr_test.cu | 8 +- .../memory/allocation/best_fit_allocator.cc | 6 +- .../memory/allocation/best_fit_allocator.h | 8 +- .../memory/allocation/buffered_allocator.cc | 7 +- .../memory/allocation/buffered_allocator.h | 4 +- .../allocation/buffered_allocator_test.cc | 6 +- .../fluid/memory/allocation/cpu_allocator.cc | 4 +- .../fluid/memory/allocation/cpu_allocator.h | 4 +- .../fluid/memory/allocation/cuda_allocator.cc | 4 +- .../fluid/memory/allocation/cuda_allocator.h | 4 +- 
.../cuda_device_context_allocator.h | 14 +-- .../allocation/cuda_virtual_mem_allocator.cc | 4 +- .../allocation/cuda_virtual_mem_allocator.h | 4 +- .../memory/allocation/locked_allocator.cc | 4 +- .../memory/allocation/locked_allocator.h | 4 +- .../allocation/naive_best_fit_allocator.cc | 4 +- .../allocation/naive_best_fit_allocator.h | 4 +- .../fluid/memory/allocation/npu_allocator.cc | 4 +- .../fluid/memory/allocation/npu_allocator.h | 4 +- .../memory/allocation/npu_pinned_allocator.cc | 8 +- .../memory/allocation/npu_pinned_allocator.h | 8 +- .../memory/allocation/pinned_allocator.cc | 4 +- .../memory/allocation/pinned_allocator.h | 4 +- .../memory/allocation/retry_allocator.cc | 4 +- .../fluid/memory/allocation/retry_allocator.h | 4 +- .../memory/allocation/retry_allocator_test.cc | 4 +- .../allocation/stream_safe_cuda_allocator.cc | 9 +- .../allocation/stream_safe_cuda_allocator.h | 8 +- .../allocation/test_aligned_allocator.cc | 4 +- .../allocation/thread_local_allocator.h | 4 +- ...l_memory_auto_growth_best_fit_allocator.cc | 8 +- ...al_memory_auto_growth_best_fit_allocator.h | 6 +- paddle/fluid/memory/malloc.h | 2 +- .../fluid/operators/math/concat_and_split.cu | 10 +- .../device/mlu/device_context_allocator.h | 6 +- .../fluid/platform/device/npu/npu_op_runner.h | 3 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/pten/api/lib/utils/CMakeLists.txt | 2 +- paddle/pten/api/lib/utils/allocator.cc | 23 ---- paddle/pten/api/lib/utils/allocator.h | 8 +- paddle/pten/api/lib/utils/storage.cc | 5 +- paddle/pten/api/lib/utils/tensor_utils.cc | 2 +- paddle/pten/core/allocator.h | 3 + paddle/pten/core/candidate/allocator.h | 107 ++++++++++++++++++ paddle/pten/core/dense_tensor.h | 2 + paddle/pten/core/storage.h | 1 + paddle/pten/tests/core/allocator.h | 7 +- paddle/pten/tests/core/test_allocator.cc | 4 + tools/check_file_diff_approvals.sh | 19 +--- 65 files changed, 328 insertions(+), 292 deletions(-) delete mode 100644 paddle/pten/api/lib/utils/allocator.cc create mode 100644 paddle/pten/core/candidate/allocator.h diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a46c83a2b3ad..09e4abc77f573 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -410,8 +410,8 @@ class ExecutionContext { auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T)); auto& deleter = tmp_allocation_ptr.get_deleter(); auto* allocation_ptr = tmp_allocation_ptr.release(); - auto shared_allocation = std::shared_ptr( - allocation_ptr, deleter); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); PADDLE_ENFORCE_GE( allocation_ptr->size(), framework::product(dim) * sizeof(T), diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index f11b37825d4f0..6aa10a058081b 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -17,14 +17,6 @@ limitations under the License. */ DECLARE_bool(use_stream_safe_cuda_allocator); -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index e86009e9aafea..fcdb837bc80ce 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -32,14 +32,6 @@ limitations under the License. 
*/ #include "paddle/pten/core/dense_tensor.h" -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 7fd125834a0c3..5fd581220097b 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -151,8 +151,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 46eba6a1e41bb..11858e4166595 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -183,8 +183,7 @@ void TensorFromArray(const T* src, const size_t& array_size, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -241,8 +240,7 @@ void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -312,8 +310,7 @@ inline void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 01d4dbccd50ea..2f2f4c0ead760 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -223,9 +223,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, auto t_place = tensor->place(); paddle::framework::Tensor out; - auto mem_allocation = std::make_shared( - static_cast(data), ele_num * sizeof(T), - paddle::platform::CPUPlace()); + auto mem_allocation = + std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); out.ResetHolder(mem_allocation); if (paddle::platform::is_cpu_place(t_place)) { diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index b1e0eb5ef16ab..0d5cd29a0c579 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -257,9 +257,8 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { size_t memory_size = GetLiteTensorNumel(*src) * 
framework::SizeOfType(GetNativePrecisionType(src->precision())); - std::shared_ptr holder( - new memory::allocation::Allocation(src_raw_data, memory_size, - GetNativePlace(src->target()))); + std::shared_ptr holder(new pten::Allocation( + src_raw_data, memory_size, GetNativePlace(src->target()))); dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, GetNativePrecisionType(src->precision())); diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index 10380c0d6028d..258cff32b4fca 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -23,7 +23,7 @@ namespace allocation { // For memory address alignment class AlignedAllocation : public Allocation { public: - AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) + AlignedAllocation(DecoratedAllocationPtr underlying_allocation, size_t offset) : Allocation( reinterpret_cast(underlying_allocation->ptr()) + offset, underlying_allocation->base_ptr(), @@ -32,7 +32,7 @@ class AlignedAllocation : public Allocation { underlying_allocation_(std::move(underlying_allocation)) {} private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; AlignedAllocator::AlignedAllocator( @@ -52,13 +52,17 @@ bool AlignedAllocator::IsAllocThreadSafe() const { return underlying_allocator_->IsAllocThreadSafe(); } -Allocation* AlignedAllocator::AllocateImpl(size_t size) { +pten::Allocation* AlignedAllocator::AllocateImpl(size_t size) { auto raw_allocation = underlying_allocator_->Allocate(size + alignment_); size_t offset = AlignedPtrOffset(raw_allocation->ptr(), alignment_); - return new AlignedAllocation(std::move(raw_allocation), offset); + auto* p = new AlignedAllocation( + static_unique_ptr_cast(std::move(raw_allocation)), offset); + return p; } -void AlignedAllocator::FreeImpl(Allocation* allocation) { delete allocation; } +void AlignedAllocator::FreeImpl(pten::Allocation* allocation) { + delete allocation; +} } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index 6fef5cae8d6af..ffd5ad0fae1b0 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -30,9 +30,9 @@ class AlignedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - Allocation* AllocateImpl(size_t size) override; + pten::Allocation* AllocateImpl(size_t size) override; - void FreeImpl(Allocation* allocation) override; + void FreeImpl(pten::Allocation* allocation) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 4998f3dbb9613..0ef6f5cbab5cc 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -18,11 +18,10 @@ namespace paddle { namespace memory { namespace allocation { -bool Allocator::IsAllocThreadSafe() const { return false; } - -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); +void Allocator::FreeImpl(pten::Allocation* allocation) { + static_cast(allocation) + ->TopDecoratedAllocator() + ->Free(allocation); } } // namespace allocation diff --git 
a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index ee802462ddc94..3f04d47516377 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/inlined_vector.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/allocator.h" DECLARE_string(allocator_strategy); @@ -80,30 +81,19 @@ class Allocator; * e.g., something what is done in AlignedAllocator, etc. * In this case, we should declare a derived class of Allocation, which * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would + * Therefore, `decorated_allocators_` of the new Allocation object + * would * be a new chain, differing from the underlying Allocation object. */ -class Allocation { +class Allocation : public pten::Allocation { public: - inline Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), base_ptr_(ptr), size_(size), place_(place) {} - inline Allocation(void* ptr, void* base_ptr, size_t size, - platform::Place place) - : ptr_(ptr), base_ptr_(base_ptr), size_(size), place_(place) {} - - Allocation(const Allocation& o) = delete; - Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; - - // Returns the holding pointer. - // NOTE: For performance consideration, it is better not to make this method - // as a virtual method. If we want to implement a `defragmentation` later, - // we might need to make `ptr_` field as a protected field, and add a virtual - // method like `defragmentation` to change `ptr_`. - inline void* ptr() const { return ptr_; } - - inline void* base_ptr() const { + Allocation(void* ptr, size_t size, platform::Place place) + : pten::Allocation(ptr, size, place), base_ptr_(ptr) {} + Allocation(void* ptr, void* base_ptr, size_t size, + const platform::Place& place) + : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} + + void* base_ptr() const { PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", paddle::platform::errors::Unimplemented( "base_ptr() is only implemented for auto_growth " @@ -112,21 +102,6 @@ class Allocation { return base_ptr_; } - // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the - // last valid element. - // - // NOTE: Some allocator might alloc more memory than request. The size - // could larger than its request. For example, - // the AlignedAllocator will always allocate memory as size + kAlignment. - // The raw pointer might not aligned, so an offset might be added to raw - // the pointer. The size of this allocation will be - // `size + kAlignemnt - offset`. - inline size_t size() const { return size_; } - - inline const platform::Place& place() const { return place_; } - - virtual ~Allocation() {} - private: inline void RegisterDecoratedAllocator(Allocator* allocator) { decorated_allocators_.emplace_back(allocator); @@ -139,10 +114,7 @@ class Allocation { } private: - void* ptr_; void* base_ptr_; // the point that directly requested from system - size_t size_; - platform::Place place_; /** * NOTE(zjl): Since decorated_allocators_ is usually a small vector. 
@@ -162,53 +134,42 @@ class Allocation { friend class Allocator; }; +using AllocationPtr = pten::Allocator::AllocationPtr; +using DecoratedAllocationPtr = + std::unique_ptr; + // Base interface class of memory Allocator. -class Allocator { +class Allocator : public pten::Allocator { public: - virtual ~Allocator() {} - - class AllocationDeleter { - public: - inline void operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); - } - }; - - using AllocationPtr = std::unique_ptr; + static void AllocationDeleter(pten::Allocation* allocation) { + Allocator* allocator = + static_cast(allocation)->TopDecoratedAllocator(); + allocator->Free(allocation); + } // Allocate an allocation. // size may be 0, but it would be too complex if we handle size == 0 // in each Allocator. So we handle size == 0 inside AllocatorFacade // in our design. - inline AllocationPtr Allocate(size_t size) { + AllocationPtr Allocate(size_t size) override { auto ptr = AllocateImpl(size); - ptr->RegisterDecoratedAllocator(this); - return AllocationPtr(ptr); + static_cast(ptr)->RegisterDecoratedAllocator(this); + return AllocationPtr(ptr, AllocationDeleter); } - // This function should not be called outside Allocator class - inline void Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); + void Free(pten::Allocation* allocation) { + static_cast(allocation)->PopDecoratedAllocator(); FreeImpl(allocation); } - inline uint64_t Release(const platform::Place& place) { - return ReleaseImpl(place); - } - - // True if the `Allocate` is thread safe. - virtual bool IsAllocThreadSafe() const; + uint64_t Release(const platform::Place& place) { return ReleaseImpl(place); } protected: - virtual Allocation* AllocateImpl(size_t size) = 0; - virtual void FreeImpl(Allocation* allocation); + virtual pten::Allocation* AllocateImpl(size_t size) = 0; + virtual void FreeImpl(pten::Allocation* allocation); virtual uint64_t ReleaseImpl(const platform::Place& place) { return 0; } }; -using AllocationDeleter = Allocator::AllocationDeleter; -using AllocationPtr = Allocator::AllocationPtr; - inline size_t AlignedSize(size_t size, size_t alignment) { auto remaining = size % alignment; return remaining == 0 ? size : size + alignment - remaining; @@ -220,6 +181,14 @@ inline size_t AlignedPtrOffset(const void* ptr, size_t alignment) { return diff == 0 ? 
0 : alignment - diff; } +template +decltype(auto) static_unique_ptr_cast(std::unique_ptr&& p) { + static_assert(std::is_base_of::value, + "Derived type must derive from Base."); + auto d = static_cast(p.release()); + return std::unique_ptr(d, p.get_deleter()); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9bc2f5461f383..474b4fe3d4522 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -94,7 +94,7 @@ class CUDAGraphAllocator class PrivateAllocation : public Allocation { public: PrivateAllocation(CUDAGraphAllocator* allocator, - AllocationPtr underlying_allocation) + DecoratedAllocationPtr underlying_allocation) : Allocation( underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), @@ -103,7 +103,7 @@ class CUDAGraphAllocator private: std::shared_ptr allocator_; - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; explicit CUDAGraphAllocator(const std::shared_ptr& allocator) @@ -116,12 +116,14 @@ class CUDAGraphAllocator } protected: - Allocation* AllocateImpl(size_t size) { + pten::Allocation* AllocateImpl(size_t size) { VLOG(10) << "Allocate " << size << " for CUDA Graph"; - return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + return new PrivateAllocation(this, + static_unique_ptr_cast( + underlying_allocator_->Allocate(size))); } - void FreeImpl(Allocation* allocation) { + void FreeImpl(pten::Allocation* allocation) { VLOG(10) << "delete for CUDA Graph"; delete allocation; } @@ -322,7 +324,7 @@ class AllocatorFacadePrivate { return static_cast(pool.Get(place))->stream(); } - void RecordStream(std::shared_ptr allocation, + void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream) { if (allocation->size() == 0) { return; @@ -339,7 +341,7 @@ class AllocatorFacadePrivate { } const gpuStream_t& GetStream( - const std::shared_ptr& allocation) const { + const std::shared_ptr& allocation) const { const StreamSafeCUDAAllocation* stream_safe_cuda_allocation = dynamic_cast(allocation.get()); PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, @@ -391,10 +393,10 @@ class AllocatorFacadePrivate { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { + pten::Allocation* AllocateImpl(size_t size) override { return new Allocation(nullptr, 0, place_); } - void FreeImpl(Allocation* allocation) override { delete allocation; } + void FreeImpl(pten::Allocation* allocation) override { delete allocation; } private: platform::Place place_; @@ -820,9 +822,9 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } -std::shared_ptr AllocatorFacade::AllocShared( +std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { - return std::shared_ptr(Alloc(place, size)); + return std::shared_ptr(Alloc(place, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, @@ -866,7 +868,7 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ->Release(place); } -std::shared_ptr AllocatorFacade::AllocShared( +std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const platform::Stream& stream) { #if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( @@ -884,14 +886,14 @@ std::shared_ptr AllocatorFacade::AllocShared( } #endif gpuStream_t s = reinterpret_cast(stream.id()); - return std::shared_ptr(Alloc(place, size, s)); + return std::shared_ptr(Alloc(place, size, s)); #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif } bool AllocatorFacade::InSameStream( - const std::shared_ptr& allocation, + const std::shared_ptr& allocation, const platform::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( @@ -962,7 +964,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, return m_->GetAllocator(place, stream)->Release(place); } -void AllocatorFacade::RecordStream(std::shared_ptr allocation, +void AllocatorFacade::RecordStream(std::shared_ptr allocation, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, @@ -983,7 +985,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, } const gpuStream_t& AllocatorFacade::GetStream( - const std::shared_ptr& allocation) const { + const std::shared_ptr& allocation) const { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index d59ecaece5a70..76e2f0b5a94f6 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -42,6 +42,7 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; class AllocatorFacadePrivate; class AllocatorFacade { public: + using Allocation = pten::Allocation; AllocatorFacade(const AllocatorFacade& o) = delete; const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; ~AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index dd2a65d889d8d..ad62af8480f58 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -45,7 +45,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { +pten::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( + size_t unaligned_size) { size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -78,11 +79,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { size_t realloc_size = std::max(size, chunk_size_); try { - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } catch (BadAlloc &ex) { if (FLAGS_free_when_no_cache_hit) throw ex; FreeIdleChunks(); - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } auto *chunk = &(*chunks_.rbegin()); @@ -104,7 +107,7 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { return new BlockAllocation(block_it); } -void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { +void 
AutoGrowthBestFitAllocator::FreeImpl(pten::Allocation *allocation) { VLOG(10) << "Free " << allocation->size() << " bytes, ptr = " << allocation->ptr(); std::lock_guard guard(spinlock_); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 2334a1b6d4d55..94aff93ec50f8 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -36,9 +36,9 @@ class AutoGrowthBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; + pten::Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + void FreeImpl(pten::Allocation *allocation) override; // Release the memory block which is not used in pool. uint64_t ReleaseImpl(const platform::Place &place) override { @@ -64,10 +64,10 @@ class AutoGrowthBestFitAllocator : public Allocator { }; struct Chunk { - explicit Chunk(AllocationPtr allocation) + explicit Chunk(DecoratedAllocationPtr allocation) : allocation_(std::move(allocation)) {} - AllocationPtr allocation_; + DecoratedAllocationPtr allocation_; List blocks_; }; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 926af8292d2e8..5942fbe730e57 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -28,12 +28,12 @@ namespace allocation { class RecordedAllocator : public Allocator { protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { allocated_size_ += size; return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; @@ -79,7 +79,7 @@ class LimitedResourceAllocator : public Allocator { size_t AllocatedSize() const { return allocated_size_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { if (allocated_size_ + size > capacity_) { throw BadAlloc("", __FILE__, __LINE__); } @@ -88,7 +88,7 @@ class LimitedResourceAllocator : public Allocator { return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/allocation/base_ptr_test.cu index a34750a5e34ba..5edabfcb9f5e7 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/allocation/base_ptr_test.cu @@ -37,7 +37,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -56,7 +56,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, 
size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -77,7 +77,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { size_t size = dis_(random_engine_); AllocationPtr allocation = Alloc(place_, size); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); @@ -91,7 +91,7 @@ class CUDAAllocatoionBasePtrTest : public ::testing::Test { void ZeroSizeAllocTest() { AllocationPtr allocation = Alloc(place_, 0); - void* base_ptr = allocation->base_ptr(); + void* base_ptr = static_cast(allocation.get())->base_ptr(); void* system_ptr = platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); EXPECT_EQ(base_ptr, system_ptr); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 0955b5212622f..3cba70bd5b502 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -33,7 +33,7 @@ static int HighestBitPos(size_t N) { } } -BestFitAllocator::BestFitAllocator(Allocation* allocation) +BestFitAllocator::BestFitAllocator(pten::Allocation* allocation) : allocation_(allocation) { details::Chunk chunk; chunk.size_ = allocation_->size(); @@ -115,7 +115,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::FreeImpl(pten::Allocation* allocation) { auto* bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( bf_allocation, @@ -150,7 +150,7 @@ void BestFitAllocator::FreeImpl(Allocation* allocation) { InsertFreeNode(chunk_it); delete allocation; } -Allocation* BestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation* BestFitAllocator::AllocateImpl(size_t size) { auto highest_set_bit = static_cast(HighestBitPos(size)); MapIt map_it; for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 42f69e6d704af..297d876178f3d 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -108,7 +108,7 @@ class BestFitAllocation : public Allocation { // the prev-chunk and the next-chunk when possible. 
class BestFitAllocator : public Allocator { public: - explicit BestFitAllocator(Allocation* allocation); + explicit BestFitAllocator(pten::Allocation* allocation); void* BasePtr() const { return allocation_->ptr(); } @@ -127,11 +127,11 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: - Allocation* allocation_; // not owned + pten::Allocation* allocation_; // not owned details::ChunkList chunks_; details::FreeChunkBin free_chunks_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index 325cb010bf466..11739ebba955f 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -46,12 +46,13 @@ void BufferedAllocator::FreeCache(size_t size) { bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } -void BufferedAllocator::FreeImpl(Allocation *allocation) { +void BufferedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); - allocations_.emplace(allocation->size(), AllocationPtr(allocation)); + allocations_.emplace(allocation->size(), + AllocationPtr(allocation, Allocator::AllocationDeleter)); } -Allocation *BufferedAllocator::AllocateImpl(size_t size) { +pten::Allocation *BufferedAllocator::AllocateImpl(size_t size) { { platform::LockGuardPtr guard(mtx_); auto it = allocations_.lower_bound(size); diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index 5e1733bd839de..0ccccef573963 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -45,8 +45,8 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 0bfa10a1616b6..21c30efccd8ad 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -27,7 +27,7 @@ namespace memory { namespace allocation { inline std::unique_ptr GetBufferedAllocator( - Allocation *allocation, bool thread_safe) { + pten::Allocation *allocation, bool thread_safe) { std::unique_ptr allocator(new BestFitAllocator(allocation)); if (thread_safe) { allocator.reset(new LockedAllocator(std::move(allocator))); @@ -68,7 +68,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( alloc, platform::errors::InvalidArgument( @@ -77,7 +77,7 @@ class StubAllocator : public Allocator { ++destruct_count_; delete allocation; } - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++construct_count_; if (size == 0) { return 
new StubAllocation(nullptr, 0, platform::CPUPlace()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 128591f5a8d3e..bf0bd891be26f 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -24,7 +24,7 @@ namespace allocation { bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { +void CPUAllocator::FreeImpl(pten::Allocation *allocation) { void *p = allocation->ptr(); #ifdef _WIN32 _aligned_free(p); @@ -34,7 +34,7 @@ void CPUAllocator::FreeImpl(Allocation *allocation) { delete allocation; } -Allocation *CPUAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUAllocator::AllocateImpl(size_t size) { void *p; #ifdef _WIN32 p = _aligned_malloc(size, kAlignment); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 058ff63381658..a64089dd2de42 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -37,8 +37,8 @@ class CPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; }; } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 4242083f2e617..ff9bbf4ab3df8 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -32,7 +32,7 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -42,7 +42,7 @@ void CUDAAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* CUDAAllocator::AllocateImpl(size_t size) { +pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); }); void* ptr; diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 5969d4d20ddee..57e85a3dc21d1 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -28,8 +28,8 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 33cf2fe054247..a6696634c12d4 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -41,7 +41,7 @@ namespace allocation { */ class CUDADeviceContextAllocation : public Allocation { public: - explicit CUDADeviceContextAllocation(AllocationPtr 
allocation) + explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) : Allocation(allocation->ptr(), allocation->base_ptr(), allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} @@ -56,7 +56,7 @@ class CUDADeviceContextAllocation : public Allocation { << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; - AllocationDeleter()(p_allocation); + Allocator::AllocationDeleter(p_allocation); }); } @@ -65,7 +65,7 @@ class CUDADeviceContextAllocation : public Allocation { } private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; const platform::CUDADeviceContext *dev_ctx_{nullptr}; }; @@ -102,14 +102,14 @@ class CUDADeviceContextAllocator : public Allocator { } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( "Default stream is not set for CUDADeviceContextAllocator")); platform::CUDADeviceGuard guard(place_.device); - auto allocation = - new CUDADeviceContextAllocation(memory::Alloc(place_, size)); + auto allocation = new CUDADeviceContextAllocation( + static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); @@ -121,7 +121,7 @@ class CUDADeviceContextAllocator : public Allocator { return allocation; } - void FreeImpl(Allocation *allocation) override { delete allocation; } + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index f4baca8288f03..2ae2cf20ee6d4 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -101,7 +101,7 @@ CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; } -void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { +void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -140,7 +140,7 @@ void CUDAVirtualMemAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { +pten::Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { size = AlignedSize(size, granularity_); CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_; diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h index c51b56566bb02..0e1e59d200d91 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h @@ -37,8 +37,8 @@ class CUDAVirtualMemAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc 
b/paddle/fluid/memory/allocation/locked_allocator.cc index 6e8f870b235ff..a0c8efddbd80d 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -37,12 +37,12 @@ LockedAllocator::LockedAllocator( } } -void LockedAllocator::FreeImpl(Allocation *allocation) { +void LockedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); underlying_allocator_->Free(allocation); } -Allocation *LockedAllocator::AllocateImpl(size_t size) { +pten::Allocation *LockedAllocator::AllocateImpl(size_t size) { platform::LockGuardPtr guard(mtx_); return underlying_allocator_->Allocate(size).release(); } diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index 1b8418bc8494a..d17c8b24e27bd 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -29,8 +29,8 @@ class LockedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 8710bbe6ce98b..ffe7ccf9190be 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -790,7 +790,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { namespace allocation { -Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); auto *tmp_alloc = new Allocation(ptr, size, place_); platform::MemEvenRecorder::Instance().PushMemRecord( @@ -798,7 +798,7 @@ Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { return tmp_alloc; } -void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { +void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 474a308a064fd..b7b3647ff98c1 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -34,8 +34,8 @@ class NaiveBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index 074a900cf5463..d9fa7ec27fdde 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -22,7 +22,7 @@ namespace memory { namespace allocation { bool NPUAllocator::IsAllocThreadSafe() const { return true; } -void NPUAllocator::FreeImpl(Allocation* 
allocation) { +void NPUAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -32,7 +32,7 @@ void NPUAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* NPUAllocator::AllocateImpl(size_t size) { +pten::Allocation* NPUAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetNPUDeviceId(place_.device); }); diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h index bf668973505ba..88b0c9a24bb3d 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -28,8 +28,8 @@ class NPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::NPUPlace place_; diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc index 292fe15c5d952..2389973fa9b88 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -26,7 +26,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { platform::NPUEventQuery(event, &status); if (status == ACL_EVENT_STATUS_COMPLETE) { - Allocation *allocation = it->first; + auto *allocation = it->first; void *ptr = allocation->ptr(); free(ptr); npu_events_.erase(it++); @@ -38,7 +38,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { } } -Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { std::lock_guard lock(mtx_); ProcessEventsAndFree(); void *ptr; @@ -50,7 +50,7 @@ Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { return new Allocation(ptr, size, platform::NPUPinnedPlace()); } -void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void NPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { std::lock_guard lock(mtx_); void *ptr = allocation->ptr(); auto iter = npu_events_.find(allocation); @@ -83,7 +83,7 @@ uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { return static_cast(0); } -void NPUPinnedAllocator::RecordEvent(Allocation *allocation, +void NPUPinnedAllocator::RecordEvent(pten::Allocation *allocation, aclrtStream stream) { std::lock_guard lock(mtx_); aclrtEvent event = nullptr; diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h index 1d3f8bf1e449d..716b12eea15f8 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -32,16 +32,16 @@ class NPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override { return true; } void ProcessEventsAndFree(); - void RecordEvent(Allocation *allocation, aclrtStream stream); + void RecordEvent(pten::Allocation *allocation, aclrtStream stream); constexpr static size_t kAlignment = 4096UL; protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: 
- std::unordered_map npu_events_; + std::unordered_map npu_events_; mutable std::mutex mtx_; }; diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index c56a7235c109c..f1175fc4374e7 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -18,7 +18,7 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #else @@ -26,7 +26,7 @@ void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { #endif delete allocation; } -Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 4f535ef33734a..800e3ff3bb2e3 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -25,8 +25,8 @@ class CPUPinnedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 1607af3808b43..856b6c2e9a2b0 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -39,7 +39,7 @@ class WaitedAllocateSizeGuard { size_t requested_size_; }; -void RetryAllocator::FreeImpl(Allocation* allocation) { +void RetryAllocator::FreeImpl(pten::Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); underlying_allocator_->Free(allocation); @@ -51,7 +51,7 @@ void RetryAllocator::FreeImpl(Allocation* allocation) { } } -Allocation* RetryAllocator::AllocateImpl(size_t size) { +pten::Allocation* RetryAllocator::AllocateImpl(size_t size) { auto alloc_func = [&, this]() { return underlying_allocator_->Allocate(size).release(); }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 031a5e2b97f17..b427a37907a67 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -45,8 +45,8 @@ class RetryAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; uint64_t ReleaseImpl(const platform::Place& place) override { return underlying_allocator_->Release(place); } diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index 787f3d9dca377..d636c73e07a18 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -98,12 +98,12 @@ class DummyAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "Here is a test exception, always BadAlloc.")); } - void FreeImpl(Allocation *) override {} + void FreeImpl(pten::Allocation *) override {} }; TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index a4f766f1d1abc..05c6a7adaff8b 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -19,7 +19,7 @@ namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - AllocationPtr underlying_allocation, gpuStream_t owning_stream) + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) : Allocation(underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), @@ -116,7 +116,7 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } -Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { +pten::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { ProcessUnfreedAllocations(); VLOG(8) << "Try allocate " << size << " bytes"; AllocationPtr underlying_allocation; @@ -136,13 +136,14 @@ Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { throw; } StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( - std::move(underlying_allocation), default_stream_); + static_unique_ptr_cast(std::move(underlying_allocation)), + default_stream_); VLOG(8) << "Allocate " << allocation->size() << " bytes at address " << allocation->ptr(); return allocation; } -void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) { +void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) { StreamSafeCUDAAllocation* 
stream_safe_cuda_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index d84994f58a9c4..f54cdc749611a 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -34,7 +34,7 @@ namespace allocation { class StreamSafeCUDAAllocation : public Allocation { public: - StreamSafeCUDAAllocation(AllocationPtr underlying_allocation, + StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream); void RecordStream(const gpuStream_t &stream); bool CanBeFreed(); @@ -42,7 +42,7 @@ class StreamSafeCUDAAllocation : public Allocation { const gpuStream_t &GetOwningStream() const; private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; std::map outstanding_event_map_; gpuStream_t owning_stream_; SpinLock outstanding_event_map_lock_; @@ -57,8 +57,8 @@ class StreamSafeCUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: diff --git a/paddle/fluid/memory/allocation/test_aligned_allocator.cc b/paddle/fluid/memory/allocation/test_aligned_allocator.cc index 3eb1f140edd84..987c7ea772d23 100644 --- a/paddle/fluid/memory/allocation/test_aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/test_aligned_allocator.cc @@ -32,12 +32,12 @@ struct StubAllocator : public Allocator { size_t AllocNum() const { return alloc_num_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++alloc_num_; return new Allocation(new uint8_t[size], size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { delete[] static_cast(allocation->ptr()); delete allocation; --alloc_num_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index c55f579981b00..9c9306517021a 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -83,11 +83,11 @@ class ThreadLocalCUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { + pten::Allocation* AllocateImpl(size_t size) override { return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl( size); } - void FreeImpl(Allocation* allocation) override { + void FreeImpl(pten::Allocation* allocation) override { auto* tl_allocation = static_cast(allocation); auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc index 5c7e8e2d933f3..face27debe9ff 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc @@ -35,7 +35,8 @@ 
VirtualMemoryAutoGrowthBestFitAllocator:: alignment_(alignment), place_(place) {} -Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl( + size_t size) { std::lock_guard guard(spinlock_); size = AlignedSize(size, alignment_); auto result = AllocFromFreeBlocks(size); @@ -48,7 +49,8 @@ Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl(size_t size) { return result; } -void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { +void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl( + pten::Allocation *allocation) { std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; TryMergeBlock2Blocks(block_it); @@ -225,7 +227,7 @@ void VirtualMemoryAutoGrowthBestFitAllocator::ExtendAndMerge(size_t size) { } } -Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks( +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks( size_t size) { auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); if (iter != free_blocks_.end()) { diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h index 5171e5b3cd1bf..10bf0bbf49d5a 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h @@ -60,12 +60,12 @@ class VirtualMemoryAutoGrowthBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; + pten::Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + void FreeImpl(pten::Allocation *allocation) override; private: - Allocation *AllocFromFreeBlocks(size_t size); + pten::Allocation *AllocFromFreeBlocks(size_t size); void ExtendAndMerge(size_t size); void TryMergeBlock2Blocks(std::list::iterator iter); diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 7069fb46203d6..8830c46a17798 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -28,7 +28,7 @@ class DeviceContext; namespace memory { -using allocation::Allocation; +using pten::Allocation; using allocation::Allocator; using allocation::AllocationPtr; diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index bc2d496a3e76a..6892f7ce4e503 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -336,9 +336,8 @@ class ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(col_alloc_released); + memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(col_alloc_released); }); #endif } @@ -466,9 +465,8 @@ class SplitFunctor { auto* data_alloc_released = data_alloc.release(); auto* cols_alloc_released = cols_alloc.release(); context.AddStreamCallback([data_alloc_released, cols_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(cols_alloc_released); + 
memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(cols_alloc_released); }); #endif } diff --git a/paddle/fluid/platform/device/mlu/device_context_allocator.h b/paddle/fluid/platform/device/mlu/device_context_allocator.h index 9deab92af5cd6..408016c0f0d99 100644 --- a/paddle/fluid/platform/device/mlu/device_context_allocator.h +++ b/paddle/fluid/platform/device/mlu/device_context_allocator.h @@ -55,7 +55,7 @@ class MLUDeviceContextAllocation : public Allocation { << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { VLOG(4) << "Delete MLUDeviceContextAllocation at " << p_allocation; - AllocationDeleter()(p_allocation); + Allocator::AllocationDeleter(p_allocation); }); } @@ -91,7 +91,7 @@ class MLUDeviceContextAllocator : public Allocator { } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( @@ -105,7 +105,7 @@ class MLUDeviceContextAllocator : public Allocator { return allocation; } - void FreeImpl(Allocation *allocation) override { delete allocation; } + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } private: platform::MLUPlace place_; diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index e83057e682fef..c049da3b33566 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -158,8 +158,7 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation *allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation *allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream()); } else { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 3f8923440be50..659df6b9b44de 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -53,7 +53,7 @@ size_t PyArray_Size_(PyObject* numpy_data) { return res; } -class EagerNumpyAllocation : public paddle::memory::allocation::Allocation { +class EagerNumpyAllocation : public pten::Allocation { public: explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype) : Allocation( diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt index 4a44ad7758b56..a4db8c4b193b6 100644 --- a/paddle/pten/api/lib/utils/CMakeLists.txt +++ b/paddle/pten/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS +cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits) diff --git a/paddle/pten/api/lib/utils/allocator.cc b/paddle/pten/api/lib/utils/allocator.cc deleted file mode 100644 index e80152431e712..0000000000000 --- a/paddle/pten/api/lib/utils/allocator.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/api/lib/utils/allocator.h" - -namespace paddle { -namespace experimental { - -memory::Allocator::AllocationDeleter DefaultAllocator::deleter_; - -} // namespace experimental -} // namespace paddle diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h index 4f5a810e400ce..a8c05b7651689 100644 --- a/paddle/pten/api/lib/utils/allocator.h +++ b/paddle/pten/api/lib/utils/allocator.h @@ -22,14 +22,15 @@ limitations under the License. */ namespace paddle { namespace experimental { -class DefaultAllocator : public pten::Allocator { +class DefaultAllocator : public pten::deprecated::Allocator { public: - using Allocation = pten::Allocation; + using Allocation = pten::deprecated::Allocation; explicit DefaultAllocator(const paddle::platform::Place& place) : place_(place) {} static void Delete(Allocation* allocation) { - deleter_(allocation->CastContextWithoutCheck()); + paddle::memory::allocation::Allocator::AllocationDeleter( + allocation->CastContextWithoutCheck()); } Allocation Allocate(size_t bytes_size) override { @@ -42,7 +43,6 @@ class DefaultAllocator : public pten::Allocator { private: paddle::platform::Place place_; - static paddle::memory::Allocator::AllocationDeleter deleter_; }; } // namespace experimental diff --git a/paddle/pten/api/lib/utils/storage.cc b/paddle/pten/api/lib/utils/storage.cc index 9ee1b9e5b7f92..6116a709d5065 100644 --- a/paddle/pten/api/lib/utils/storage.cc +++ b/paddle/pten/api/lib/utils/storage.cc @@ -20,14 +20,13 @@ namespace experimental { ExternalStorage::ExternalStorage(void* ptr, size_t size, const paddle::platform::Place& place) - : pten::Storage( - std::make_shared(ptr, size, place)), + : pten::Storage(std::make_shared(ptr, size, place)), size_(size) {} ExternalStorage::ExternalStorage(const pten::intrusive_ptr& root, size_t delta, size_t size) - : Storage(std::make_shared( + : Storage(std::make_shared( static_cast(root->data()) + delta, size, root->place())), size_(size) { PADDLE_ENFORCE_LE(static_cast(delta + size), diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 69a1fc274a28d..0b6cb8d95cc1a 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -307,7 +307,7 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst->Resize(src->dims()); dst->set_type(pten::TransToProtoVarType(src->dtype())); auto storage = src->release(); - std::shared_ptr holder( + std::shared_ptr holder( new TensorStorage(std::move(storage))); dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h index 74455be136834..2647490c9f58b 100644 --- a/paddle/pten/core/allocator.h +++ b/paddle/pten/core/allocator.h @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/candidate/allocator.h" namespace pten { +namespace deprecated { /// \brief Encapsulates strategies for access/addressing, allocation/ /// deallocation and construction/destruction of objects. @@ -147,4 +149,5 @@ inline Allocation Allocate(const std::shared_ptr& a, size_t n) { return a->Allocate(n); } +} // namespace deprecated } // namespace pten
diff --git a/paddle/pten/core/candidate/allocator.h b/paddle/pten/core/candidate/allocator.h new file mode 100644 index 0000000000000..75d42c4fd15cb --- /dev/null +++ b/paddle/pten/core/candidate/allocator.h
@@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */
+ +#pragma once + +#include +#include +#include "paddle/fluid/platform/place.h" + +namespace pten {
+ +/// \brief Fancy pointer with deleter. The use of this data type +/// is to be compatible with allocators from different frameworks +/// without significant performance loss. This class does not +/// support being inherited. +class Allocation { + public: + using Place = paddle::platform::Place; + using DeleterFnPtr = void (*)(Allocation*); + + Allocation() = default; + + // Don't own resources, only provide access. + Allocation(void* data, size_t size, const Place& place) + : ptr_(data), size_(size), place_(place) {} + + // Own resources. + Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place) + : ptr_(data), size_(size), deleter_(deleter), place_(place) {} + + Allocation(Allocation&& other) noexcept { swap(*this, other); } + Allocation& operator=(Allocation&& other) noexcept { + // Exchange them explicitly so that moving is not equivalent + // to copying. + swap(*this, other); + return *this; + } + + virtual ~Allocation() { + if (deleter_) { + deleter_(this); + } + }
+ + // Returns the holding pointer. + // NOTE: For performance consideration, it is better not to make this method + // a virtual method. If we want to implement a `defragmentation` later, + // we might need to make the `ptr_` field a protected field, and add a virtual + // method like `defragmentation` to change `ptr_`. + void* ptr() const noexcept { return ptr_; }
+ + // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the + // last valid element. + // + // NOTE: Some allocators might allocate more memory than requested. The size + // could be larger than the requested size. For example, + // the AlignedAllocator will always allocate memory as size + kAlignment. + // The raw pointer might not be aligned, so an offset might be added to + // the raw pointer. The size of this allocation will be + // `size + kAlignment - offset`.
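// ---------------------------------------------------------------------------
// Usage sketch (a minimal illustration, assuming std::malloc/std::free and
// paddle::platform::CPUPlace; `FreeHostBuffer` is a hypothetical deleter
// matching DeleterFnPtr): the two value constructors above cover both
// non-owning views and owning buffers.
//
//   static void FreeHostBuffer(Allocation* a) { std::free(a->ptr()); }
//
//   void* buf = std::malloc(256);
//   Allocation owned(buf, 256, &FreeHostBuffer,
//                    paddle::platform::CPUPlace());  // buf freed on destruction
//   Allocation view(owned.ptr(), owned.size(),
//                   paddle::platform::CPUPlace());   // non-owning access only
// ---------------------------------------------------------------------------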
+ size_t size() const noexcept { return size_; } + + void* operator->() const noexcept { return ptr_; } + operator bool() const noexcept { return ptr_; } + const Place& place() const noexcept { return place_; } + DeleterFnPtr deleter() const noexcept { return deleter_; } + + protected: + friend void swap(Allocation& a, Allocation& b) noexcept; + void* ptr_{nullptr}; + size_t size_{}; + DeleterFnPtr deleter_{nullptr}; + // TODO(Shixiaowei02): Enum needs to be used instead to reduce + // the construction overhead by more than 50%. + Place place_; +}; + +inline void swap(Allocation& a, Allocation& b) noexcept { + ::std::swap(a.ptr_, b.ptr_); + ::std::swap(a.deleter_, b.deleter_); + ::std::swap(a.place_, b.place_); + ::std::swap(a.size_, b.size_); +} + +class Allocator { + public: + using DeleterType = std::function; + using AllocationPtr = std::unique_ptr; + + virtual ~Allocator() = default; + virtual AllocationPtr Allocate(size_t bytes_size) = 0; + + virtual bool IsAllocThreadSafe() const { return false; } +}; + +} // namespace pten diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 1502accd197be..1802a2461158f 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -60,6 +60,8 @@ class TensorInplaceVersion { class DenseTensor : public TensorBase, public TypeInfoTraits { public: + using Allocator = deprecated::Allocator; + /// \brief Construct a dense tensor and allocate space. /// \param a The allocator used to allocate space. /// \param meta The meta data of dense tensor. diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h index fc56935eeaf19..cf18dd913093a 100644 --- a/paddle/pten/core/storage.h +++ b/paddle/pten/core/storage.h @@ -91,6 +91,7 @@ class Storage : public intrusive_ref_counter { class TensorStorage : public Storage { public: using Place = paddle::platform::Place; + using Allocator = deprecated::Allocator; explicit TensorStorage(const std::shared_ptr& a) : alloc_(a) {} diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h index 094c0e8437d98..c2c74e1aacf1f 100644 --- a/paddle/pten/tests/core/allocator.h +++ b/paddle/pten/tests/core/allocator.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace pten { namespace tests { -class HostAllocatorSample : public pten::RawAllocator { +class HostAllocatorSample : public pten::deprecated::RawAllocator { public: using Place = paddle::platform::Place; void* Allocate(size_t bytes_size) override { @@ -36,8 +36,9 @@ class HostAllocatorSample : public pten::RawAllocator { Place place_{paddle::platform::CPUPlace()}; }; -class FancyAllocator : public pten::Allocator { +class FancyAllocator : public pten::deprecated::Allocator { public: + using Allocation = pten::deprecated::Allocation; static void Delete(Allocation* allocation) { ::operator delete(allocation->ptr()); } @@ -55,7 +56,7 @@ class FancyAllocator : public pten::Allocator { template struct CustomAllocator { using value_type = T; - using Allocator = pten::RawAllocator; + using Allocator = pten::deprecated::RawAllocator; explicit CustomAllocator(const std::shared_ptr& a) noexcept : alloc_(a) {} diff --git a/paddle/pten/tests/core/test_allocator.cc b/paddle/pten/tests/core/test_allocator.cc index c509d8bd20a01..94ba9a1e1b9a2 100644 --- a/paddle/pten/tests/core/test_allocator.cc +++ b/paddle/pten/tests/core/test_allocator.cc @@ -24,6 +24,10 @@ limitations under the License. 
*/ namespace pten { namespace tests { +using RawAllocator = pten::deprecated::RawAllocator; +using Allocator = pten::deprecated::Allocator; +using Allocation = pten::deprecated::Allocation; + template bool host_allocator_test(size_t vector_size) { std::vector src(vector_size); diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index e0ae600819873..caacecf446a82 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -226,7 +226,7 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then HAS_MODIFIED_ALLOCATION=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/memory/allocation" || true` if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must be approved by zhiqiu and Shixiaowei02 for paddle/fluid/memory/allocation.\nIt is being modularized and refactored. Thanks!\n" - check_approval 2 6888866 39303645 + check_approval 1 6888866 39303645 fi HAS_MODIFIED_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/tensor" || true` @@ -241,23 +241,6 @@ if [ "${HAS_MODIFIED_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 22561442 22334008 fi -ALLOCSHARED_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "*\.(h|cc)" || true` -if [ "${ALLOCSHARED_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - ERROR_LINES="" - for TEST_FILE in ${ALLOCSHARED_FILE_CHANGED}; - do - HAS_SKIP_CHECK_ALLOC_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "AllocShared" || true` - if [ "${HAS_SKIP_CHECK_ALLOC_CI}" != "" ]; then - ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_ALLOC_CI}\n" - fi - done - if [ "${ERROR_LINES}" != "" ]; then - ERROR_LINES=${ERROR_LINES//+/'\n+\t'} - echo_line="memory::AllocShared is not recommended, because it is being modularized and refactored. Please use memory::Alloc here. Otherwise, please request zhiqiu and Shixiaowei02 review and approve.\n" - check_approval 2 6888866 39303645 - fi -fi - ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="PADDLE_ENFORCE is not recommended. 
Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" From 5e5157812d0284f265c4d927b85d66b5bfb9c6d2 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Thu, 13 Jan 2022 11:06:09 +0800 Subject: [PATCH 03/24] Support test_imperative using_non_zero_gpu with _test_eager_guard() (#38881) * Support test_imperative using_non_zero_gpu and Add a TODO comment * Change GPU number to 0 * Modify the cuda device selection method --- .../unittests/test_imperative_numpy_bridge.py | 1 + .../test_imperative_using_non_zero_gpu.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index 4f3089baffdd3..7b8d31ff030e5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -42,6 +42,7 @@ def func_tensor_from_numpy(self): self.assertEqual(data_np[0][0], -1) if _in_eager_mode(): # eager_mode, var2 is EagerTensor, is not subscriptable + # TODO(wuweilong): to support slice in eager mode later self.assertNotEqual(var2.numpy()[0][0], -1) else: self.assertNotEqual(var2[0][0].numpy()[0], -1) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py index f2dfaef397797..46a89efcec491 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle import paddle.fluid as fluid import unittest -from paddle.fluid.dygraph import to_variable, Embedding, guard +from paddle.fluid.dygraph import to_variable, guard import numpy as np +from paddle.fluid.framework import _test_eager_guard class TestImperativeUsingNonZeroGpu(unittest.TestCase): @@ -24,12 +26,21 @@ def run_main(self, np_arr, place): var = to_variable(np_arr) self.assertTrue(np.array_equal(np_arr, var.numpy())) - def test_non_zero_gpu(self): + def func_non_zero_gpu(self): if not fluid.is_compiled_with_cuda(): return np_arr = np.random.random([11, 13]).astype('float32') - self.run_main(np_arr, fluid.CUDAPlace(0)) + if paddle.device.cuda.device_count() > 1: + # should use non zero gpu if there are more than 1 gpu + self.run_main(np_arr, fluid.CUDAPlace(1)) + else: + self.run_main(np_arr, fluid.CUDAPlace(0)) + + def test_non_zero_gpu(self): + with _test_eager_guard(): + self.func_non_zero_gpu() + self.func_non_zero_gpu() if __name__ == '__main__': From 281644cd0734d99151b08f8e221c2fd58a326249 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Jan 2022 11:15:49 +0800 Subject: [PATCH 04/24] Fix mkldnn invalid infershape impl (#38837) * fix mkldnn invalid infershape * add unittest for mkldnn in new executor * add import os --- .../fluid/eager/legacy/infer_shape_context.h | 19 ++++++++++++++----- .../fluid/eager/legacy/prepared_operator.cc | 2 +- .../new_executor/new_executor_defs.cc | 11 +++++++++++ .../new_executor/new_executor_defs.h | 2 ++ paddle/fluid/framework/op_desc.cc | 4 ++++ paddle/fluid/framework/operator.cc | 15 ++++++++++++--- paddle/fluid/framework/operator.h | 7 ++----- paddle/fluid/framework/shape_inference.h | 2 ++ paddle/fluid/imperative/infer_shape_context.h | 19 +++++++++++++------ paddle/fluid/imperative/prepared_operator.cc | 8 ++++---- paddle/fluid/operators/batch_norm_op.cc | 6 +++--- paddle/fluid/operators/conv_op.cc | 2 +- paddle/fluid/operators/conv_transpose_op.cc | 4 ++-- paddle/fluid/operators/inplace_abn_op.cc | 8 ++++---- paddle/fluid/operators/pool_op.cc | 2 +- .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 10 ++++++++++ 16 files changed, 86 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/eager/legacy/infer_shape_context.h b/paddle/fluid/eager/legacy/infer_shape_context.h index 7a05f6a9b3581..a1032fd404f85 100644 --- a/paddle/fluid/eager/legacy/infer_shape_context.h +++ b/paddle/fluid/eager/legacy/infer_shape_context.h @@ -31,15 +31,18 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { using DDim = paddle::framework::DDim; public: - EagerInferShapeContext(const NameTensorMap* in, const NameTensorMap* out, - const paddle::framework::AttributeMap* attr, - const paddle::framework::AttributeMap* default_attr, - const std::string op_type) + EagerInferShapeContext( + const NameTensorMap* in, const NameTensorMap* out, + const paddle::framework::AttributeMap* attr, + const paddle::framework::AttributeMap* default_attr, + const std::string op_type, + const paddle::framework::OpKernelType* op_kernel_type = nullptr) : tensor_in_(in), tensor_out_(out), attrs_(attr), default_attrs_(default_attr), - op_type_(op_type) {} + op_type_(op_type), + op_kernel_type_(op_kernel_type) {} bool HasInput(const std::string& name) const override { // has only one input @@ -214,6 +217,11 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + return (op_kernel_type_ && (op_kernel_type_->data_layout_ == + 
paddle::framework::DataLayout::kMKLDNN)); + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -400,6 +408,7 @@ class EagerInferShapeContext : public paddle::framework::InferShapeContext { const paddle::framework::AttributeMap* attrs_; const paddle::framework::AttributeMap* default_attrs_; const std::string op_type_; + const paddle::framework::OpKernelType* op_kernel_type_; }; } // namespace legacy diff --git a/paddle/fluid/eager/legacy/prepared_operator.cc b/paddle/fluid/eager/legacy/prepared_operator.cc index 4e892b14a9c9c..fbf2d678740ab 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.cc +++ b/paddle/fluid/eager/legacy/prepared_operator.cc @@ -173,7 +173,7 @@ static void PreparedOpRunImpl( paddle::framework::Scope scope; EagerInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, &default_attrs, - op.Type()); + op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); func(EagerExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs, diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 4b9404fd178fd..654746794da4e 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -307,6 +307,17 @@ void InterpretercoreInferShapeContext::SetLoDLevel(const std::string& out, bool InterpretercoreInferShapeContext::IsRuntime() const { return true; } +bool InterpretercoreInferShapeContext::IsRunMKLDNNKernel() const { + try { + auto& op_with_kernel = dynamic_cast(op_); + return ((op_with_kernel.kernel_type()) && + (op_with_kernel.kernel_type()->data_layout_ == + framework::DataLayout::kMKLDNN)); + } catch (std::bad_cast exp) { + return false; + } +} + // TODO(paddle-dev): Can this be template? std::vector InterpretercoreInferShapeContext::GetInputVarPtrs( const std::string& name) const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index ca49e7f5670d6..5d63eb33d424b 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -84,6 +84,8 @@ class InterpretercoreInferShapeContext : public InferShapeContext { bool IsRuntime() const override; + bool IsRunMKLDNNKernel() const override; + // TODO(paddle-dev): Can this be template? 
std::vector GetInputVarPtrs( const std::string& name) const override; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 4254ec236d473..7bceeb05bac59 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -240,6 +240,8 @@ class CompileTimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override; + bool IsRunMKLDNNKernel() const override; + std::vector GetInputsVarType( const std::string &name) const override { return GetVarTypes(Inputs(name)); @@ -930,6 +932,8 @@ void CompileTimeInferShapeContext::SetRepeatedDims( bool CompileTimeInferShapeContext::IsRuntime() const { return false; } +bool CompileTimeInferShapeContext::IsRunMKLDNNKernel() const { return false; } + proto::VarType::Type CompileTimeInferShapeContext::GetVarType( const std::string &name) const { return block_.FindVarRecursive(name)->GetType(); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index dc4d1365093aa..93349b8b88449 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -884,6 +884,17 @@ class RuntimeInferShapeContext : public InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + try { + auto& op_with_kernel = dynamic_cast(op_); + return ((op_with_kernel.kernel_type()) && + (op_with_kernel.kernel_type()->data_layout_ == + framework::DataLayout::kMKLDNN)); + } catch (std::bad_cast exp) { + return false; + } + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -1178,9 +1189,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("infer_shape", platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); - // TODO(chenweihang): replace this after removing `this->IsMKLDNNType()` - // in some mkldnn infershape functions, such conv2d infershape - this->InferShape(&infer_shape_ctx); + this->Info().infer_shape_(&infer_shape_ctx); } if (FLAGS_enable_unused_var_check) { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 09e4abc77f573..8e69f96dfb813 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -528,11 +528,6 @@ class OperatorWithKernel : public OperatorBase { return g_all_op_kernels; } - bool IsMKLDNNType() const { - return ((this->kernel_type_) && (this->kernel_type_->data_layout_ == - framework::DataLayout::kMKLDNN)); - } - bool SupportGPU() const override { auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), @@ -609,6 +604,8 @@ class OperatorWithKernel : public OperatorBase { return pt_kernel_context_.get(); } + const OpKernelType* kernel_type() const { return kernel_type_.get(); } + private: void RunImpl(const Scope& scope, const platform::Place& place) const final; void RunImpl(const Scope& scope, const platform::Place& place, diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h index 10b0fa6afd78a..791600b39c3d9 100644 --- a/paddle/fluid/framework/shape_inference.h +++ b/paddle/fluid/framework/shape_inference.h @@ -102,6 +102,8 @@ class InferShapeContext { virtual bool IsRuntime() const = 0; + virtual bool IsRunMKLDNNKernel() const = 0; + virtual std::vector GetInputVarPtrs( const std::string &name) const = 0; virtual std::vector GetOutputVarPtrs( diff --git 
a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index 167d5682cbfdb..a16ad1688fbac 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -32,16 +32,17 @@ class DygraphInferShapeContext : public framework::InferShapeContext { using DDim = framework::DDim; public: - DygraphInferShapeContext(const NameVarMap* in, - const NameVarMap* out, - const framework::AttributeMap* attr, - const framework::AttributeMap* default_attr, - const std::string op_type) + DygraphInferShapeContext( + const NameVarMap* in, const NameVarMap* out, + const framework::AttributeMap* attr, + const framework::AttributeMap* default_attr, const std::string op_type, + const framework::OpKernelType* op_kernel_type = nullptr) : var_base_map_in_(in), var_base_map_out_(out), attrs_(attr), default_attrs_(default_attr), - op_type_(op_type) {} + op_type_(op_type), + op_kernel_type_(op_kernel_type) {} bool HasInput(const std::string& name) const override { // has only one input @@ -214,6 +215,11 @@ class DygraphInferShapeContext : public framework::InferShapeContext { bool IsRuntime() const override { return true; } + bool IsRunMKLDNNKernel() const override { + return (op_kernel_type_ && + (op_kernel_type_->data_layout_ == framework::DataLayout::kMKLDNN)); + } + // TODO(paddle-dev): Can this be template? std::vector GetInputVarPtrs( const std::string& name) const override { @@ -399,6 +405,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext { const framework::AttributeMap* attrs_; const framework::AttributeMap* default_attrs_; const std::string op_type_; + const framework::OpKernelType* op_kernel_type_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 1d12ecf30ede5..46e974c8f43f3 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -514,8 +514,8 @@ static void PreparedOpRunImpl( // TODO(zjl): remove scope in dygraph framework::Scope scope; - DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, - &default_attrs, op.Type()); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, @@ -560,8 +560,8 @@ static void PreparedOpRunPtImpl( platform::DeviceContext* dev_ctx, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, - &default_attrs, op.Type()); + DygraphInferShapeContext infer_shape_ctx( + &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index bc5bd118dbec4..0a8e753c01dc0 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -93,7 +93,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { x_dims, x_dims.size())); const int64_t C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? 
x_dims[1] : x_dims[x_dims.size() - 1]); @@ -508,7 +508,7 @@ void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { ctx->Attrs().Get("data_layout")); const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? x_dims[1] : x_dims[x_dims.size() - 1]); @@ -911,7 +911,7 @@ void BatchNormDoubleGradOp::InferShape( const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW) ? x_dims[1] : x_dims[x_dims.size() - 1]); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 41f6f75200697..e500814232aae 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -57,7 +57,7 @@ std::vector ConvOp::ComputeOutputShape( // MKL-DNN Kernels are using NCHW order of dims description // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (this->IsMKLDNNType() == false) && + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && (data_format == "NHWC" || data_format == "NDHWC"); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index d60786f60e9cc..12f537e2f7980 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -49,8 +49,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { const std::string data_layout_str = ctx->Attrs().Get("data_format"); const DataLayout data_layout = - this->IsMKLDNNType() ? DataLayout::kNCHW - : framework::StringToDataLayout(data_layout_str); + ctx->IsRunMKLDNNKernel() ? DataLayout::kNCHW + : framework::StringToDataLayout(data_layout_str); PADDLE_ENFORCE_EQ(in_dims.size() == 4 || in_dims.size() == 5, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 8234d63d681ff..7a112292c8fc5 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -100,10 +100,10 @@ class InplaceABNGradOp : public paddle::operators::BatchNormGradOp { const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); - const int C = - ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) - ? y_dims[1] - : y_dims[y_dims.size() - 1]); + const int C = ((ctx->IsRunMKLDNNKernel() == true) || + (data_layout == DataLayout::kNCHW) + ? 
y_dims[1] + : y_dims[y_dims.size() - 1]); ctx->SetOutputDim(framework::GradVarName("X"), y_dims); // has_scale_grad == has_bias_grad, judge has_scale_grad is enough diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index fa98e76e39338..b4ba80ae7ae2f 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -97,7 +97,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { // MKL-DNN Kernels are using NCHW order of dims description // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (this->IsMKLDNNType() == false) && + const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && (data_format == "NHWC" || data_format == "NDHWC"); // update paddings if "SAME" or global_pooling diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 50d53864789f3..487a69807e2b0 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import numpy as np @@ -232,6 +233,15 @@ def init_group(self): self.groups = 3 +# TODO(chenweihang): To solve the coverage problem, add this unittest, +# remove this unittest after new executor set to default executor +class TestConv2dMKLDNNByNewExecutor(TestConv2DMKLDNNOp): + def test_check_output_by_new_executor(self): + os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' + self.test_check_output() + del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + + if __name__ == '__main__': from paddle import enable_static enable_static() From fc6eed5b2789d5cdb5c84bf2fb9e41db2bcfdc5d Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 13 Jan 2022 04:43:45 +0100 Subject: [PATCH 05/24] Added mul BF16/FP32 FWD/BWD oneDNN kernel (#38552) * base changes for mul reimplementation * empty commit * tmp save * full implementation of mul bf16/fp32 fwd bwd * CI fix * CI rerun * changed unity build cmake to avoid gpu issues * removed mul mkldnn from unity build * added skipping tests if not cpu_bf16 * CI fix * CI fix * CI fix --- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 109 +---------- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 176 +++++++++++++++++- paddle/fluid/operators/mul_op.cc | 36 ++++ paddle/fluid/operators/mul_op.h | 1 + paddle/fluid/operators/unity_build_rule.cmake | 1 - paddle/fluid/platform/mkldnn_reuse.h | 108 +++++++++++ .../contrib/mixed_precision/bf16/amp_lists.py | 2 +- .../fluid/tests/book/test_fit_a_line.py | 13 ++ .../mkldnn/test_mul_int8_mkldnn_op.py | 2 + .../unittests/mkldnn/test_mul_mkldnn_op.py | 159 ++++++++++++++++ 10 files changed, 490 insertions(+), 117 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index a8d4b852ca3c2..d3c7c1759641b 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -20,6 +20,7 @@ using dnnl::memory; using dnnl::primitive; using paddle::framework::DataLayout; using paddle::framework::ExecutionContext; +using paddle::platform::MatMulV2MKLDNNHandler; using paddle::platform::GetMKLDNNFormat; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNGetDataType; @@ -107,114 +108,6 @@ std::vector 
GetInputStrides(const ExecutionContext& ctx, return strides; } -template -class MatMulV2MKLDNNHandler - : public paddle::platform::MKLDNNHandlerNoCachingT { - public: - MatMulV2MKLDNNHandler(const dnnl::engine engine, - paddle::platform::Place cpu_place, - const std::vector& x_org_dims, bool trans_x, - const std::vector& y_org_dims, bool trans_y, - bool is_output_fused, - const std::vector& x_strides_override, - const std::vector& y_strides_override) - : paddle::platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - // M X K * K X N - std::vector x_dims(x_org_dims); - std::vector y_dims(y_org_dims); - - const int MB_idx = x_dims.size() - 3; - const int H_idx = x_dims.size() - 2; - const int W_idx = x_dims.size() - 1; - - if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); - if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); - - const memory::dim M = x_dims[H_idx]; - const memory::dim K = x_dims[W_idx]; - const memory::dim N = y_dims[W_idx]; - - std::vector x_strides(x_dims.size() - 3, 1); - std::vector y_strides(x_dims.size() - 3, 1); - std::vector out_strides(x_dims.size() - 3, 1); - std::vector out_ddims(x_dims.size() - 3, 1); - - x_strides.reserve(x_dims.size()); - y_strides.reserve(x_dims.size()); - out_strides.reserve(x_dims.size()); - - if (!x_strides_override.empty()) { - x_strides = x_strides_override; - } else { - if (!trans_x) { - x_strides.insert(x_strides.end(), {M * K, K, 1}); - } else { - x_strides.insert(x_strides.end(), {M * K, 1, M}); - } - } - - if (!y_strides_override.empty()) { - y_strides = y_strides_override; - } else { - if (!trans_y) { - y_strides.insert(y_strides.end(), {N * K, N, 1}); - } else { - y_strides.insert(y_strides.end(), {N * K, 1, K}); - } - } - - out_strides.insert(out_strides.end(), {M * N, N, 1}); - out_ddims.insert(out_ddims.end(), - {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); - - for (int i = x_dims.size() - 4; i >= 0; --i) { - out_ddims[i] = std::max(x_dims[i], y_dims[i]); - if (x_strides_override.empty()) { - x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; - } - if (y_strides_override.empty()) { - y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; - } - out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; - } - - if (is_output_fused) { - out_strides = FakeTransposeStrides(out_ddims); - } - - auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); - auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); - auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); - - this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); - } - - std::vector FakeTransposeStrides( - const std::vector& matmul_out_dims) const { - // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and - // transpose axis are: {0, 2, 1, 3} - std::vector transpose_axis = {0, 2, 1, 3}; - std::vector fake_strides(transpose_axis.size()); - int ndims = static_cast(transpose_axis.size()); - - int total_stride = 1; - - for (int i = ndims - 1; i >= 0; --i) { - fake_strides[transpose_axis[i]] = total_stride; - total_stride *= matmul_out_dims[transpose_axis[i]]; - } - - return fake_strides; - } - - std::shared_ptr AcquireWeightsMemory(const Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), - to_void_cast(input_data)); - } -}; - bool IsOutputFused(const ExecutionContext& ctx) { auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); auto& fused_transpose_Out = ctx.Attr>("fused_transpose_Out"); diff --git 
a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 0938024052271..49c896ef80fcc 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/mul_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace framework { @@ -32,13 +32,17 @@ namespace operators { using framework::DataLayout; using framework::DDim; using framework::ExecutionContext; +using framework::LoDTensor; using framework::Tensor; + +using platform::MatMulV2MKLDNNHandler; +using platform::MKLDNNDeviceContext; +using platform::to_void_cast; + using dnnl::inner_product_forward; using dnnl::memory; using dnnl::prop_kind; using dnnl::stream; -using platform::MKLDNNDeviceContext; -using platform::to_void_cast; template class MulPrimitiveFactory { @@ -345,7 +349,7 @@ inner_product_forward GetMulPrimitive(const MKLDNNDeviceContext &dev_ctx, /* XT: input x data type, YT: input y data type */ template -class MulMKLDNNKernel : public framework::OpKernel { +class MulMKLDNNINT8Kernel : public framework::OpKernel { public: void Compute(const ExecutionContext &ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, @@ -371,17 +375,175 @@ class MulMKLDNNKernel : public framework::OpKernel { } }; +template +class MulMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + + protected: + void ExecuteMatMul(const ExecutionContext &ctx, + const MKLDNNDeviceContext &dev_ctx, + const dnnl::engine &onednn_engine, + const platform::Place &cpu_place, const Tensor *x, + const std::vector &x_dims, bool trans_x, + const Tensor *y, const std::vector &y_dims, + bool trans_y, Tensor *out) const { + static const std::vector vec_placeholder; + MatMulV2MKLDNNHandler handler(onednn_engine, ctx.GetPlace(), x_dims, + trans_x, y_dims, trans_y, false, + vec_placeholder, vec_placeholder); + + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto &astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + // plain output formats are enforced inside handler + out->set_format(platform::MKLDNNFormatForSize( + out->dims().size(), dnnl::memory::format_tag::nchw)); + } + + private: + void RunKernel(const ExecutionContext &ctx) const { + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + auto *out = ctx.Output("Out"); + + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : *x; + const Tensor y_matrix = y->dims().size() > 2 + ? 
framework::ReshapeToMatrix(*y, y_num_col_dims) + : *y; + + // adding mb dim because MatMulV2 handler needs it + std::vector y_dims(3, 1); + std::vector x_dims(3, 1); + + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + + ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), &x_matrix, + x_dims, false, &y_matrix, y_dims, false, out); + } +}; + +template +class MulGradMKLDNNKernel : public MulMKLDNNKernel { + public: + void Compute(const ExecutionContext &ctx) const override { RunKernel(ctx); } + + private: + template + void RunKernel(const ExecutionContext &ctx) const { + const auto &dev_ctx = ctx.template device_context(); + const auto &onednn_engine = dev_ctx.GetEngine(); + + const auto *x = ctx.Input("X"); + const auto *y = ctx.Input("Y"); + const auto *dout = ctx.Input(framework::GradVarName("Out")); + + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dy = ctx.Output(framework::GradVarName("Y")); + + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + + const Tensor x_matrix = x->dims().size() > 2 + ? framework::ReshapeToMatrix(*x, x_num_col_dims) + : static_cast(*x); + const Tensor y_matrix = y->dims().size() > 2 + ? framework::ReshapeToMatrix(*y, y_num_col_dims) + : static_cast(*y); + + Tensor dout_matrix = *dout; + dout_matrix.Resize( + {framework::flatten_to_2d(x->dims(), x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]}); + + // adding mb dim because MatMulV2 handler needs it + std::vector x_dims(3, 1); + std::vector y_dims(3, 1); + std::vector dout_dims(3, 1); + + x_dims[1] = x_matrix.dims()[0]; + x_dims[2] = x_matrix.dims()[1]; + + y_dims[1] = y_matrix.dims()[0]; + y_dims[2] = y_matrix.dims()[1]; + + dout_dims[1] = dout_matrix.dims()[0]; + dout_dims[2] = dout_matrix.dims()[1]; + + if (dx != nullptr) { + dx->set_lod(x->lod()); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), + &dout_matrix, dout_dims, false, &y_matrix, y_dims, + true, static_cast(dx)); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + this->ExecuteMatMul(ctx, dev_ctx, onednn_engine, ctx.GetPlace(), + &x_matrix, x_dims, true, &dout_matrix, dout_dims, + false, static_cast(dy)); + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, U8, ops::kMULMKLDNNINT8, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, S8, ops::kMULMKLDNNINT8, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul, MKLDNN, ::paddle::platform::CPUPlace, + FP32, ops::kMULMKLDNNFP32, + ops::MulMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + mul, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kMULMKLDNNFP32, + ops::MulMKLDNNKernel); REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace, - ops::MulMKLDNNKernel); + ops::MulMKLDNNINT8Kernel, + ops::MulMKLDNNKernel, + ops::MulMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(mul_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kMULMKLDNNFP32, + ops::MulGradMKLDNNKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + mul_grad, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kMULMKLDNNFP32, + ops::MulGradMKLDNNKernel, + ops::MulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index 
14291f8458430..691c394870ad4 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -113,6 +113,12 @@ class MulOp : public framework::OperatorWithKernel { if (input_data_type == framework::DataTypeTrait::DataType() || input_data_type == framework::DataTypeTrait::DataType()) { customized_type_value = kMULMKLDNNINT8; + } else if (input_data_type == + framework::DataTypeTrait< + paddle::platform::bfloat16>::DataType() || + input_data_type == + framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNFP32; } } #endif @@ -233,6 +239,36 @@ class MulGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + this->CanMKLDNNBeUsed(ctx, input_data_type)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + + if (input_data_type == framework::DataTypeTrait::DataType() || + input_data_type == framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNINT8; + } else if (input_data_type == + framework::DataTypeTrait< + paddle::platform::bfloat16>::DataType() || + input_data_type == + framework::DataTypeTrait::DataType()) { + customized_type_value = kMULMKLDNNFP32; + } + } +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library, customized_type_value); + } }; template diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index 3a13e0576e347..0fb32cf4be886 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -25,6 +25,7 @@ namespace operators { using Tensor = framework::Tensor; constexpr int kMULMKLDNNINT8 = 1; +constexpr int kMULMKLDNNFP32 = 2; template class MulKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 25aef67425ef9..5ab2004617810 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -192,7 +192,6 @@ register_unity_group(cc pad_op.cc) register_unity_group(cc modified_huber_loss_op.cc - mkldnn/mul_mkldnn_op.cc partial_sum_op.cc pixel_shuffle_op.cc pool_op.cc diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index c16137b50dbf7..ef216e48416f9 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -772,6 +772,114 @@ class ReductionMKLDNNHandler } }; +template +class MatMulV2MKLDNNHandler + : public paddle::platform::MKLDNNHandlerNoCachingT { + public: + MatMulV2MKLDNNHandler(const dnnl::engine engine, + paddle::platform::Place cpu_place, + const std::vector& x_org_dims, bool trans_x, + const std::vector& y_org_dims, bool trans_y, + bool is_output_fused, + const std::vector& x_strides_override, + const std::vector& y_strides_override) + : paddle::platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + // M X K * K X N + std::vector x_dims(x_org_dims); + std::vector y_dims(y_org_dims); + + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() 
- 2; + const int W_idx = x_dims.size() - 1; + + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (!x_strides_override.empty()) { + x_strides = x_strides_override; + } else { + if (!trans_x) { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } else { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } + } + + if (!y_strides_override.empty()) { + y_strides = y_strides_override; + } else { + if (!trans_y) { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } else { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + if (x_strides_override.empty()) { + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + } + if (y_strides_override.empty()) { + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + } + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + if (is_output_fused) { + out_strides = FakeTransposeStrides(out_ddims); + } + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); + + this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); + } + + std::vector FakeTransposeStrides( + const std::vector& matmul_out_dims) const { + // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and + // transpose axis are: {0, 2, 1, 3} + std::vector transpose_axis = {0, 2, 1, 3}; + std::vector fake_strides(transpose_axis.size()); + int ndims = static_cast(transpose_axis.size()); + + int total_stride = 1; + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= matmul_out_dims[transpose_axis[i]]; + } + + return fake_strides; + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data)); + } +}; + template class ActivationMKLDNNHandler : public MKLDNNHandlerNoCachingT> 16)) + out = numpy.reshape(out, in_list.shape).view(numpy.uint16) + return out + + def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16): x = fluid.layers.data(name='x', shape=[13], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='float32') @@ -158,6 +167,10 @@ def infer(use_cuda, save_dirname=None, use_bf16=False): test_data = next(test_reader()) test_feat = numpy.array( [data[0] for data in test_data]).astype("float32") + + if use_bf16: + test_feat = convert_float_to_uint16(test_feat) + test_label = numpy.array( [data[1] for data in test_data]).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py index 0c91868d30245..9265d5f7edfbb 100644 --- 
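
[Reviewer note — illustrative sketch, not part of the patch] The convert_float_to_uint16 helper used by the bf16 tests in this series keeps only the upper 16 bits of the IEEE-754 float32 pattern (sign, exponent, top 7 mantissa bits), which is exactly the bfloat16 bit layout stored in a uint16 array. A vectorised equivalent (my own wording, not the test helper itself):

import numpy as np

def float32_to_bf16_bits(arr):
    # reinterpret the float32 payload as uint32 and keep the high half-word
    bits = np.asarray(arr, dtype=np.float32).view(np.uint32)
    return (bits >> 16).astype(np.uint16)

x = np.array([1.0, -2.5, 3.14159], dtype=np.float32)
print(float32_to_bf16_bits(x))   # truncation (no rounding), e.g. [16256 49184 16457]
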
a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci ''' @@ -159,4 +160,5 @@ def init_data_type(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py new file mode 100644 index 0000000000000..a0581d791209d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_mul_mkldnn_op.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from numpy.matrixlib import defmatrix +import paddle +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, OpTestTool + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestMulOneDNNOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.attrs = {'use_mkldnn': True} + self.init_shapes_and_attrs() + + self.x_fp32 = np.random.random(self.x_shape).astype(np.float32) + self.y_fp32 = np.random.random(self.y_shape).astype(np.float32) + + self.x = self.x_fp32 + self.y = self.y_fp32 + + self.init_inputs_dtype() + + self.inputs = {'X': self.x, 'Y': self.y} + + output = np.dot( + np.reshape(self.x_fp32, self.np_x_shape), + np.reshape(self.y_fp32, self.np_y_shape)) + self.outputs = {'Out': np.reshape(output, self.out_shape)} + + def init_shapes_and_attrs(self): + self.x_shape = (20, 5) + self.y_shape = (5, 21) + + self.np_x_shape = (20, 5) + self.np_y_shape = (5, 21) + + self.out_shape = (20, 21) + + def init_inputs_dtype(self): + pass + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.check_grad_with_place(core.CPUPlace(), ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place(core.CPUPlace(), ['Y'], 'Out', set('X')) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place(core.CPUPlace(), ['X'], 'Out', set('Y')) + + +class TestMulXNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (6, 7, 5) + self.y_shape = (5, 21) + + self.np_x_shape = (42, 5) + self.np_y_shape = (5, 21) + + self.out_shape = (6, 7, 21) + + self.attrs["x_num_col_dims"] = 2 + + +class TestMulYNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (20, 6) + self.y_shape = (2, 3, 21) + + self.np_x_shape = (20, 6) + self.np_y_shape = (6, 21) + + self.out_shape = (20, 21) + + self.attrs["y_num_col_dims"] = 2 + + +class TestMulYAndXNumColDims2OneDNNOp(TestMulOneDNNOp): + def init_shapes_and_attrs(self): + self.x_shape = (10, 5, 6) + self.y_shape = (2, 3, 21) + + 
self.np_x_shape = (50, 6) + self.np_y_shape = (6, 21) + + self.out_shape = (10, 5, 21) + + self.attrs["x_num_col_dims"] = 2 + self.attrs["y_num_col_dims"] = 2 + + +class TestMulBF16OneDNNOp(TestMulOneDNNOp): + def init_inputs_dtype(self): + self.x = convert_float_to_uint16(self.x) + self.y = convert_float_to_uint16(self.y) + + def calculate_grads(self): + x_np = np.reshape(self.x_fp32, self.np_x_shape) + y_np = np.reshape(self.y_fp32, self.np_y_shape) + + self.dout = self.outputs['Out'] + self.dout_np = np.reshape(self.dout, (x_np.shape[0], y_np.shape[1])) + + y_np_trans = np.transpose(y_np, (1, 0)) + x_np_trans = np.transpose(x_np, (1, 0)) + + self.dx = np.matmul(self.dout_np, y_np_trans) + self.dy = np.matmul(x_np_trans, self.dout_np) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['X', 'Y'], + 'Out', + user_defined_grads=[self.dx, self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def test_check_grad_ingore_x(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['Y'], + 'Out', + set('X'), + user_defined_grads=[self.dy], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + def test_check_grad_ingore_y(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ['X'], + 'Out', + set('Y'), + user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 08dcea18edaf19ef1eeea1a8905e28d6f318d211 Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 13 Jan 2022 14:00:27 +0800 Subject: [PATCH 06/24] roi_align aligned supported (#38905) roi_align aligned supported --- .../tensorrt/convert/roi_align_op.cc | 4 +- paddle/fluid/inference/tensorrt/op_teller.cc | 30 --------- .../tensorrt/plugin/roi_align_op_plugin.cu | 64 +++++++++++-------- .../tensorrt/plugin/roi_align_op_plugin.h | 4 +- .../inference/test_trt_convert_roi_align.py | 10 --- 5 files changed, 45 insertions(+), 67 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc index 654fe7e013379..54f7937d83747 100644 --- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -51,6 +51,7 @@ class RoiAlignOpConverter : public OpConverter { BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale")); const auto sampling_ratio = BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio")); + const auto aligned = BOOST_GET_CONST(bool, op_desc.GetAttr("aligned")); const auto input_tensor = engine_->GetITensor(input_name); const auto rois_tensor = engine_->GetITensor(rois_name); @@ -63,7 +64,8 @@ class RoiAlignOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( - data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); + data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio, + aligned); auto roi_align_layer = engine_->network()->addPluginV2( inputs.data(), inputs.size(), *roi_align_plugin); layer = roi_align_layer; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 878eef016e7d1..ddee4e0d682b0 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,9 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/op_teller.h" - #include - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" @@ -737,28 +735,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } - if (op_type == "roi_align") { - if (!with_dynamic_shape) return false; - - std::vector attrs{"pooled_height", "pooled_width", - "spatial_scale", "sampling_ratio"}; - for (auto const attr : attrs) { - if (!desc.HasAttr(attr)) return false; - } - - const auto pooled_height = - BOOST_GET_CONST(int, desc.GetAttr("pooled_height")); - if (pooled_height <= 0) return false; - - const auto pooled_width = - BOOST_GET_CONST(int, desc.GetAttr("pooled_width")); - if (pooled_width <= 0) return false; - - const auto spatial_scale = - BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); - if (spatial_scale <= 0.f) return false; - } - if (op_type == "hard_swish") { if (desc.Input("X").size() != 1) { VLOG(3) << "HardSwish op has only 1 input, but got " @@ -1303,12 +1279,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); if (spatial_scale <= 0.f) return false; - const auto sampling_ratio = - BOOST_GET_CONST(int, desc.GetAttr("sampling_ratio")); - const auto aligned = BOOST_GET_CONST(bool, desc.GetAttr("aligned")); - - if (sampling_ratio == -1 && aligned == true) return false; - auto roi_align_inputs = desc.Inputs(); if (roi_align_inputs.find("RoisNum") != roi_align_inputs.end()) { if (desc.Input("RoisNum").size() >= 1) { diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 06540b3626082..7dc31fb44719a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -58,14 +58,12 @@ __inline__ __device__ T BilinearInterpolate(const T* input_data, } template -__global__ void GPUROIAlignOpt(const int nthreads, - const T* __restrict__ input_data, - const T* __restrict__ input_rois, - const float spatial_scale, const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - const int sampling_ratio, const int num_rois, - OutT* __restrict__ output_data) { +__global__ void GPUROIAlignOpt( + const int nthreads, const T* __restrict__ input_data, + const T* __restrict__ input_rois, const float spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int sampling_ratio, + const int num_rois, const bool aligned, OutT* __restrict__ output_data) { const int batch = blockIdx.x; const int channel = blockIdx.y; const T* offset_input_data = @@ -84,21 +82,28 @@ __global__ void GPUROIAlignOpt(const int nthreads, const int roi_idx = (idx / pooled_width / pooled_height) % num_rois; const int n = batch * num_rois + roi_idx; const float4 rois_offset = reinterpret_cast(input_rois)[n]; - const T roi_xmin = rois_offset.x * spatial_scale; - const T roi_ymin = rois_offset.y * spatial_scale; - const T roi_xmax = rois_offset.z * spatial_scale; - const T roi_ymax = rois_offset.w * spatial_scale; - const T roi_width = max(roi_xmax - roi_xmin, static_cast(1.f)); - const T roi_height = max(roi_ymax - roi_ymin, static_cast(1.f)); - const T bin_size_h = roi_height / static_cast(pooled_height); - const T bin_size_w = roi_width / static_cast(pooled_width); + const T roi_offset = aligned ? 
static_cast(0.5) : 0; + const T roi_xmin = rois_offset.x * spatial_scale - roi_offset; + const T roi_ymin = rois_offset.y * spatial_scale - roi_offset; + const T roi_xmax = rois_offset.z * spatial_scale - roi_offset; + const T roi_ymax = rois_offset.w * spatial_scale - roi_offset; + + T roi_width = roi_xmax - roi_xmin; + T roi_height = roi_ymax - roi_ymin; + if (!aligned) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } + const T bin_size_h = + static_cast(roi_height) / static_cast(pooled_height); + const T bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); const int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); const int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; - + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0.f; for (int iy = 0; iy < roi_bin_grid_h; ++iy) { const T y = roi_ymin + ph * bin_size_h + @@ -132,12 +137,13 @@ RoiAlignPluginDynamic::RoiAlignPluginDynamic(const nvinfer1::DataType data_type, const int pooled_height, const int pooled_width, float spatial_scale, - int sampling_ratio) + int sampling_ratio, bool aligned) : data_type_(data_type), pooled_height_(pooled_height), pooled_width_(pooled_width), spatial_scale_(spatial_scale), - sampling_ratio_(sampling_ratio) { + sampling_ratio_(sampling_ratio), + aligned_(aligned) { bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT || data_type_ == nvinfer1::DataType::kHALF; PADDLE_ENFORCE_EQ(data_type_is_valid, true, @@ -187,6 +193,7 @@ RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { DeserializeValue(&data, &length, &pooled_width_); DeserializeValue(&data, &length, &spatial_scale_); DeserializeValue(&data, &length, &sampling_ratio_); + DeserializeValue(&data, &length, &aligned_); int smem_per_block = -1; int device = -1; cudaGetDevice(&device); @@ -204,7 +211,7 @@ nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const TRT_NOEXCEPT { auto* plugin = new RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, - spatial_scale_, sampling_ratio_); + spatial_scale_, sampling_ratio_, aligned_); plugin->setPluginNamespace(namespace_.c_str()); return plugin; } @@ -272,14 +279,15 @@ int RoiAlignPluginDynamic::enqueue_impl( output_size, static_cast(inputs[0]), static_cast(inputs[1]), spatial_scale_, channels, height, width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, - static_cast(outputs[0])); + aligned_, static_cast(outputs[0])); } else { GPUROIAlignOpt< - T, OutT, true><<>>( + T, OutT, + false><<>>( output_size, static_cast(inputs[0]), static_cast(inputs[1]), spatial_scale_, channels, height, width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, - static_cast(outputs[0])); + aligned_, static_cast(outputs[0])); } return cudaGetLastError() != cudaSuccess; @@ -313,6 +321,10 @@ const char* RoiAlignPluginDynamic::getPluginType() const TRT_NOEXCEPT { return "roi_align_plugin_dynamic"; } +const char* RoiAlignPluginDynamic::getPluginVersion() const TRT_NOEXCEPT { + return "2"; +} + int RoiAlignPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } int RoiAlignPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } @@ -326,6 +338,7 @@ size_t RoiAlignPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { serialize_size += SerializedSize(pooled_width_); serialize_size += SerializedSize(spatial_scale_); 
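
[Reviewer note — illustrative sketch, not part of the patch] The updated GPUROIAlignOpt kernel above shifts the ROI coordinates by half a pixel when aligned=True and only clamps the box to a minimum 1x1 size in the legacy (aligned=False) path. A small sketch of that geometry with made-up numbers:

def roi_geometry(xmin, ymin, xmax, ymax, spatial_scale, aligned):
    offset = 0.5 if aligned else 0.0
    x0 = xmin * spatial_scale - offset
    y0 = ymin * spatial_scale - offset
    x1 = xmax * spatial_scale - offset
    y1 = ymax * spatial_scale - offset
    roi_w, roi_h = x1 - x0, y1 - y0
    if not aligned:                    # legacy behaviour keeps a minimum 1x1 box
        roi_w, roi_h = max(roi_w, 1.0), max(roi_h, 1.0)
    return x0, y0, roi_w, roi_h

print(roi_geometry(4, 4, 8, 8, spatial_scale=0.25, aligned=False))  # (1.0, 1.0, 1.0, 1.0)
print(roi_geometry(4, 4, 8, 8, spatial_scale=0.25, aligned=True))   # (0.5, 0.5, 1.0, 1.0)
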
serialize_size += SerializedSize(sampling_ratio_); + serialize_size += SerializedSize(aligned_); return serialize_size; } @@ -335,6 +348,7 @@ void RoiAlignPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, pooled_width_); SerializeValue(&buffer, spatial_scale_); SerializeValue(&buffer, sampling_ratio_); + SerializeValue(&buffer, aligned_); } void RoiAlignPluginDynamic::destroy() TRT_NOEXCEPT {} @@ -357,7 +371,7 @@ const char* RoiAlignPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { const char* RoiAlignPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { - return "1"; + return "2"; } const nvinfer1::PluginFieldCollection* diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h index 44d2b63069835..9f4723da9e17b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -31,7 +31,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type, const int pooled_height, const int pooled_width, float spatial_scale, - int sampling_ratio); + int sampling_ratio, bool aligned); RoiAlignPluginDynamic(void const* data, size_t length); ~RoiAlignPluginDynamic() = default; nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; @@ -66,6 +66,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { size_t getSerializationSize() const TRT_NOEXCEPT override; void serialize(void* buffer) const TRT_NOEXCEPT override; void destroy() TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; private: template @@ -80,6 +81,7 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { float spatial_scale_; int sampling_ratio_; int smem_per_block_; + bool aligned_; std::string namespace_; }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py index 56efdb91959ce..b2d754337fe02 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py @@ -176,16 +176,6 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT RoisNum NOT SUPPORT") - def teller2(program_config, predictor_config): - if (program_config.ops[0].attrs['sampling_ratio'] == -1 and - program_config.ops[0].attrs['aligned'] == True): - return True - return False - - self.add_skip_case( - teller2, SkipReasons.TRT_NOT_SUPPORT, - "SAMPLING_RATIO EQUAL TO - 1 WHEN ALIGNED IS TRUE IS NOT SUPPORT") - def test(self): self.add_skip_trt_case() self.run_test() From a6cf6cddd323436b0e441aeb6f67a9a5da6c2172 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Thu, 13 Jan 2022 14:32:22 +0800 Subject: [PATCH 07/24] [fleet_executor] fix uninitialized pointer (#38904) --- paddle/fluid/distributed/fleet_executor/carrier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 7762effdb9c87..9a74fa78c0e76 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -101,8 +101,8 @@ class Carrier final { std::mutex running_mutex_; 
std::condition_variable cond_var_; std::vector microbatch_scopes_; - framework::Scope* root_scope_; - framework::Scope* minibatch_scope_; + framework::Scope* root_scope_{nullptr}; + framework::Scope* minibatch_scope_{nullptr}; paddle::platform::Place place_; paddle::platform::DeviceContext* dev_ctx_{nullptr}; int64_t rank_; From 53783e1e3d972a5eccb4936ce0ef9ee4aa292a96 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Thu, 13 Jan 2022 14:48:24 +0800 Subject: [PATCH 08/24] [Dist Pass] AMP pass add dist_update_loss_scaling op (#38902) --- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/common.py | 2 +- .../operators/dist_update_loss_scaling.py | 134 ++++++++++++++++++ 3 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 5502cb3191a48..c28b7930124dd 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -24,3 +24,4 @@ from . import dist_transpose from . import dist_default from . import dist_check_finite_and_unscale +from . import dist_update_loss_scaling diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index 32496b94b920c..8f1ba33f544fb 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -15,7 +15,7 @@ from ..dist_attribute import OperatorDistributedAttribute _g_distributed_operator_impl_registries = {} -BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale'} +BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} class DistributedOperatorImplContainer: diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py new file mode 100644 index 0000000000000..56782bec0856a --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import set_dist_op_desc_original_id + + +class DistributedUpdateLossScaling(DistributedOperatorImplContainer): + def __init__(self, name): + super(DistributedUpdateLossScaling, self).__init__() + self._name = name + + +register_distributed_operator_impl_container( + "update_loss_scaling", DistributedUpdateLossScaling("update_loss_scaling")) + + +class DistributedUpdateLossScalingImpl(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedUpdateLossScalingImpl, self).__init__() + self._name = name + self._forward_implemented = False + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's is_input_compatible should not be called !" + ) + + def is_output_compatible(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's is_output_compatible should not be called !" + ) + + def update_dims_mapping(self, dist_op): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's update_dims_mapping should not be called !" + ) + + @staticmethod + def forward(ctx, *args, **kwargs): + raise RuntimeError( + "DistributedUpdateLossScalingImpl's forward should not be called !") + + @staticmethod + def backward(ctx, *args, **kwargs): + + # the backward function only filte the gradient with current rank id + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.get_dst_main_program().global_block() + backward_op = dist_op_context.get_cur_src_op() + rank_id = dist_op_context.get_rank_id() + dist_attr = ctx.get_op_dist_attr_for_program(backward_op) + assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( + str(backward_op)) + + assert rank_id in dist_attr.process_mesh.processes + + assert 'X' in kwargs, "input [{}] is not given".format('X') + assert 'FoundInfinite' in kwargs, "input [{}] is not given".format( + 'FoundInfinite') + assert 'PrevLossScaling' in kwargs, "input [{}] is not given".format( + 'PrevLossScaling') + assert 'InGoodSteps' in kwargs, "input [{}] is not given".format( + 'InGoodSteps') + assert 'InBadSteps' in kwargs, "input [{}] is not given".format( + 'InBadSteps') + + assert 'Out' in kwargs, "output [{}] is not given".format('Out') + assert 'LossScaling' in kwargs, "output [{}] is not given".format( + 'LossScaling') + assert 'OutGoodSteps' in kwargs, "input [{}] is not given".format( + 'OutGoodSteps') + assert 'OutBadSteps' in kwargs, "input [{}] is not given".format( + 'OutBadSteps') + + assert len(kwargs['FoundInfinite']) == 1, \ + "update_loss_scaling input FoundInfinite take 1 variable but got {}".format( + kwargs['FoundInfinite']) + assert len(kwargs['PrevLossScaling']) == 1, \ + "update_loss_scaling input PrevLossScaling take 1 variable but got {}".format( + kwargs['PrevLossScaling']) + assert len(kwargs['InGoodSteps']) == 1, \ + "update_loss_scaling input InGoodSteps take 1 variable but got {}".format( + kwargs['InGoodSteps']) + assert len(kwargs['InBadSteps']) == 1, \ + "update_loss_scaling input InBadSteps take 1 variable but got {}".format( + kwargs['InBadSteps']) + assert len(kwargs['LossScaling']) == 1, \ + "update_loss_scaling output LossScaling take 1 variable but got {}".format( 
+ kwargs['LossScaling']) + assert len(kwargs['OutGoodSteps']) == 1, \ + "update_loss_scaling output OutGoodSteps take 1 variable but got {}".format( + kwargs['OutGoodSteps']) + assert len(kwargs['OutBadSteps']) == 1, \ + "update_loss_scaling output OutBadSteps take 1 variable but got {}".format( + kwargs['OutBadSteps']) + + assert len(kwargs['X']) == len(kwargs['Out']), \ + "update_loss_scaling got [{}] X and [{}] Out, which are supposed to be equal".format( + len(kwargs['X']), len(kwargs['Out'])) + + filter_vars = [] + for varname in kwargs['X']: + if rank_id in ctx.get_tensor_dist_attr_for_program( + main_block.var(varname)).process_mesh.processes: + filter_vars.append(varname) + + # replicate op in dist program + dist_op_desc = main_block.desc.append_op() + dist_op_desc.copy_from(backward_op.desc) + set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) + dist_op_desc.set_input('X', filter_vars) + dist_op_desc.set_output('Out', filter_vars) + main_block._sync_with_cpp() + + +register_distributed_operator_impl( + "update_loss_scaling", + DistributedUpdateLossScalingImpl("update_loss_scaling")) From 7e0292ead7d8c0632135e5480870e4c6bdf93acd Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 13 Jan 2022 14:51:17 +0800 Subject: [PATCH 09/24] [pten]Remove pten/include dir files (#38878) * move dot_dev api into dot_kernel.h * add infermate header * modify to dotkerel in dot_op.h * mvoe conj dev api into complex_kernel.h * move sign dev api into sign_kernel.h * move scale dev api into kernel.h and remove infermete.h * rm paddle/pten/include/math.h * rm paddle/pten/include/math.h * rm include dir * rm paddle/pten/include/math.h * fix conflict with develop branch * rm devContext in conj_op.h * add the missing complex_kernel header --- .../eager/accumulation/accumulation_node.cc | 1 - .../accumulation/gradient_accumulation.cc | 1 - .../eager_generated/backwards/scale_node.cc | 16 +++--- .../eager_generated/forwards/scale.cc | 1 - paddle/fluid/eager/eager_tensor.h | 1 - paddle/fluid/eager/grad_node_info.h | 1 - .../eager/legacy/infer_var_type_context.h | 1 - paddle/fluid/eager/legacy/prepared_operator.h | 2 - paddle/fluid/eager/legacy/tensor_helper.h | 1 - .../framework/data_device_transform_test.cu | 1 - paddle/fluid/framework/operator.h | 3 +- paddle/fluid/imperative/layer.h | 1 - paddle/fluid/imperative/op_base.h | 1 - paddle/fluid/imperative/prepared_operator.h | 2 - paddle/fluid/operators/cast_op.h | 1 - paddle/fluid/operators/conj_op.h | 3 +- paddle/fluid/operators/dot_op.h | 1 - .../elementwise/elementwise_add_op.h | 1 - .../elementwise/elementwise_mul_op.h | 1 - .../elementwise/elementwise_op_function.h | 1 - .../elementwise/elementwise_op_impl.cu.h | 1 - .../elementwise/elementwise_sub_op.h | 1 - paddle/fluid/operators/fill_any_like_op.h | 1 - paddle/fluid/operators/flatten_op.h | 1 - paddle/fluid/operators/matmul_v2_op.h | 1 - paddle/fluid/operators/reduce_ops/reduce_op.h | 2 - paddle/fluid/operators/reshape_op.cc | 1 - paddle/fluid/operators/scale_op.h | 5 +- paddle/fluid/operators/sign_op.h | 1 - paddle/fluid/pybind/eager.cc | 1 - paddle/fluid/pybind/eager_functions.cc | 1 - paddle/fluid/pybind/eager_method.cc | 1 - paddle/fluid/pybind/eager_properties.cc | 1 - paddle/fluid/pybind/eager_utils.cc | 1 - paddle/pten/CMakeLists.txt | 2 +- paddle/pten/all.cc | 17 ------- paddle/pten/all.h | 20 -------- paddle/pten/api/lib/utils.cc | 3 +- paddle/pten/include/core.h | 22 -------- paddle/pten/include/infermeta.h | 21 -------- paddle/pten/include/math.h | 39 --------------- 
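
[Reviewer note — illustrative sketch, not part of the patch] For context on the op that the DistributedUpdateLossScalingImpl above merely replicates per rank: a rough, simplified sketch of a dynamic loss-scaling update. Parameter names and defaults here are assumptions for illustration, not Paddle's exact implementation.

def update_loss_scaling(found_inf, scale, good_steps, bad_steps,
                        incr_every_n_steps=1000, decr_every_n_nan_or_inf=2,
                        incr_ratio=2.0, decr_ratio=0.5):
    if found_inf:
        good_steps, bad_steps = 0, bad_steps + 1
        if bad_steps == decr_every_n_nan_or_inf:   # repeated overflow: shrink the scale
            scale, bad_steps = scale * decr_ratio, 0
    else:
        good_steps, bad_steps = good_steps + 1, 0
        if good_steps == incr_every_n_steps:       # long stable run: grow the scale
            scale, good_steps = scale * incr_ratio, 0
    return scale, good_steps, bad_steps

print(update_loss_scaling(True, 32768.0, 5, 1))    # -> (16384.0, 0, 0)
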
paddle/pten/kernels/complex_kernel.h | 3 -- paddle/pten/kernels/cpu/scale_kernel.cc | 34 ++++++++++++- paddle/pten/kernels/flatten_kernel.h | 2 +- paddle/pten/kernels/gpu/scale_kernel.cu | 14 +++--- .../kernels/impl/matmul_grad_kernel_impl.h | 3 +- paddle/pten/kernels/impl/scale_kernel_impl.h | 50 ------------------- paddle/pten/kernels/math_kernel.h | 3 +- paddle/pten/kernels/reshape_kernel.h | 2 +- paddle/pten/kernels/scale_kernel.h | 28 ++++++++--- paddle/pten/kernels/sign_kernel.h | 2 +- paddle/pten/tests/api/scale_api.h | 35 +++++++------ .../pten/tests/kernels/test_scale_dev_api.cc | 2 +- python/paddle/utils/code_gen/api_gen.py | 6 ++- 54 files changed, 103 insertions(+), 265 deletions(-) delete mode 100644 paddle/pten/all.cc delete mode 100644 paddle/pten/all.h delete mode 100644 paddle/pten/include/core.h delete mode 100644 paddle/pten/include/infermeta.h delete mode 100644 paddle/pten/include/math.h delete mode 100644 paddle/pten/kernels/impl/scale_kernel_impl.h diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index ed1146eed0fb0..823c0153d71b0 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -18,7 +18,6 @@ #include "paddle/pten/api/all.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc index 9bc24dd28756a..1f66596a0b578 100644 --- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc +++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc @@ -28,7 +28,6 @@ #include "paddle/fluid/platform/float16.h" #include "paddle/pten/api/all.h" #include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/include/core.h" #include "unsupported/Eigen/CXX11/Tensor" #ifdef PADDLE_WITH_XPU #include "xpu/refactor/math.h" diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 02eaa79fc9b28..99f6c7a83538e 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/eager_tensor.h" -#include "paddle/pten/api/all.h" +#include "paddle/pten/kernels/scale_kernel.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -33,28 +33,28 @@ static void ScaleDeviceDispatch(const pten::DenseTensor& dense_tensor, pten::DenseTensor* dense_out) { switch (dense_tensor.dtype()) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case pten::DataType::FLOAT32: { - pten::Scale(dev_ctx, dense_tensor /* tensor */, - scale /* scale */, bias /* bias */, - bias_after_scale /* bias_after_scale */, - dense_out /* out tensor */); + pten::ScaleKernel( + dev_ctx, dense_tensor /* tensor */, scale /* scale */, + bias /* bias */, bias_after_scale /* bias_after_scale */, + dense_out /* out tensor */); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* 
tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc index 7b20ff144a7a7..642302a4119be 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/scale.cc @@ -29,7 +29,6 @@ #include "paddle/fluid/eager/utils.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 80faad9080ffe..c58c0b9e66e7a 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" // pten deps -#include "paddle/pten/all.h" #include "paddle/pten/api/all.h" #include "paddle/pten/api/lib/api_declare.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index f15c50ef75190..5cf0b90220148 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -16,7 +16,6 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { /** diff --git a/paddle/fluid/eager/legacy/infer_var_type_context.h b/paddle/fluid/eager/legacy/infer_var_type_context.h index 2d5a8d806fee7..9d9cbeb38ccfa 100644 --- a/paddle/fluid/eager/legacy/infer_var_type_context.h +++ b/paddle/fluid/eager/legacy/infer_var_type_context.h @@ -26,7 +26,6 @@ #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { namespace legacy { diff --git a/paddle/fluid/eager/legacy/prepared_operator.h b/paddle/fluid/eager/legacy/prepared_operator.h index 9ba186b14e3b3..0e00b52e0481a 100644 --- a/paddle/fluid/eager/legacy/prepared_operator.h +++ b/paddle/fluid/eager/legacy/prepared_operator.h @@ -25,8 +25,6 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/type_defs.h" -#include "paddle/pten/include/core.h" - DECLARE_bool(use_mkldnn); namespace paddle { diff --git a/paddle/fluid/eager/legacy/tensor_helper.h b/paddle/fluid/eager/legacy/tensor_helper.h index f87ab70c93686..ce407f8965aa0 100644 --- a/paddle/fluid/eager/legacy/tensor_helper.h +++ b/paddle/fluid/eager/legacy/tensor_helper.h @@ -17,7 +17,6 @@ #include #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/pten/api/all.h" -#include "paddle/pten/include/core.h" namespace egr { namespace legacy { diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index a81e4abd45e56..858688dffd8c1 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/init.h" #include "paddle/fluid/framework/pten_utils.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 8e69f96dfb813..9d75c66beb7d4 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -41,7 +41,8 @@ limitations under the License. */ #include "paddle/utils/flat_hash_map.h" #include "paddle/pten/core/arg_map_context.h" -#include "paddle/pten/include/core.h" +#include "paddle/pten/core/kernel_context.h" +#include "paddle/pten/core/kernel_factory.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 199d62bff1f20..d27460aeeccef 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -37,7 +37,6 @@ #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace framework { class Variable; diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 3ff451f817872..cb76a82353282 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -25,7 +25,6 @@ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace imperative { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 5262b265b1b53..29747e79ef6fa 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -27,8 +27,6 @@ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/type_defs.h" -#include "paddle/pten/include/core.h" - DECLARE_bool(use_mkldnn); namespace paddle { diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 72aa9a195ec7c..c54c811b25b66 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/transform.h" #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/cast_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 71115c2eba796..6df982abb8612 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -19,7 +19,6 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/complex_kernel.h" namespace paddle { @@ -39,7 +38,7 @@ class ConjKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); + pten::ConjKernel(dev_ctx, *pt_x.get(), pt_out.get()); } }; diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 8817e2f3ca79d..ceb8a28e8aa4c 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -21,7 +21,6 @@ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/dot_grad_kernel.h" #include "paddle/pten/kernels/dot_kernel.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 35807d7c57d47..622a6d7edb783 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 385c7549e07f2..687340b668a13 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/cpu_info.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 37d29ed91b3d4..626046890fb06 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/transform.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/kernels/cpu/elementwise.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 36ff1ae254d20..9cc741344e50e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -22,7 +22,6 @@ limitations under the License. 
*/ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/gpu/elementwise.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 09818380d8ea7..f035e46d1d082 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op.h" -// only can include the headers in paddle/pten/include dirs #include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h index 287bbbfa3b343..19f6e7a4ef51f 100644 --- a/paddle/fluid/operators/fill_any_like_op.h +++ b/paddle/fluid/operators/fill_any_like_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/pten_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/full_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index ef42619bfe4ff..8e54ecb922f5a 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/flatten_grad_kernel.h" #include "paddle/pten/kernels/flatten_kernel.h" diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index e93bd212868fd..9ab77cdcaec0a 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -27,7 +27,6 @@ limitations under the License. */ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/matmul_grad_kernel.h" #include "paddle/pten/kernels/matmul_kernel.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index e1854d8a13d8b..eb4d4a5c1680e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -26,8 +26,6 @@ limitations under the License. */ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/math.h" #include "paddle/pten/kernels/cpu/reduce.h" #if defined(__HIPCC__) || defined(__NVCC__) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a25e53aac5d73..47b8da70adbac 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" #include "paddle/pten/common/scalar_array.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/reshape_grad_kernel.h" #include "paddle/pten/kernels/reshape_kernel.h" namespace paddle { diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 6011fe9a66b60..a6f4f6e27204e 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -19,7 +19,6 @@ limitations under the License. */ // only can include the headers in paddle/top/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/scale_kernel.h" namespace paddle { @@ -70,8 +69,8 @@ class ScaleKernel : public framework::OpKernel { auto pt_out = paddle::experimental::MakePtenDenseTensor(*out); // call new kernel - pten::Scale(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale, - pt_out.get()); + pten::ScaleKernel(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale, + pt_out.get()); } }; diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h index b8dd44c01b050..8294cd2c5f145 100644 --- a/paddle/fluid/operators/sign_op.h +++ b/paddle/fluid/operators/sign_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/pten/include/core.h" #include "paddle/pten/kernels/sign_kernel.h" namespace paddle { diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 9484d506b20fb..102bc9f162b0f 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 659df6b9b44de..aaf86bc41aeff 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -34,7 +34,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index a0067f9c64fb1..a8c1da2a8b866 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -31,7 +31,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 71b8bbbb1a283..038a1254d7ef6 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -28,7 +28,6 @@ limitations under the License. 
*/ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" #pragma GCC diagnostic ignored "-Wwrite-strings" namespace paddle { diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9849d0d41611b..c1049d240795c 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/pten/common/data_type.h" #include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/core.h" namespace paddle { namespace pybind { diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 6a823ff3672bf..a9b7c7581bc2b 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -29,4 +29,4 @@ get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) message(STATUS "All standard pten kernels: ${pten_kernels}") set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels}) -cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS}) +cc_library(pten DEPS ${PTEN_DEPS}) diff --git a/paddle/pten/all.cc b/paddle/pten/all.cc deleted file mode 100644 index d8d96e1cd461e..0000000000000 --- a/paddle/pten/all.cc +++ /dev/null @@ -1,17 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/all.h" - -namespace pten {} // namespace pten diff --git a/paddle/pten/all.h b/paddle/pten/all.h deleted file mode 100644 index c8be629b10e75..0000000000000 --- a/paddle/pten/all.h +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// developer apis -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/include/math.h" diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc index ddb29c8833f3b..6eb1e5a3797c9 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/utils.cc @@ -22,8 +22,7 @@ limitations under the License. 
*/ #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); diff --git a/paddle/pten/include/core.h b/paddle/pten/include/core.h deleted file mode 100644 index 9a042753d1f73..0000000000000 --- a/paddle/pten/include/core.h +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_context.h" -#include "paddle/pten/core/kernel_factory.h" -#include "paddle/pten/core/tensor_meta.h" diff --git a/paddle/pten/include/infermeta.h b/paddle/pten/include/infermeta.h deleted file mode 100644 index 5e356dd37c03e..0000000000000 --- a/paddle/pten/include/infermeta.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/infermeta/multiary.h" -#include "paddle/pten/infermeta/nullary.h" -#include "paddle/pten/infermeta/unary.h" diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h deleted file mode 100644 index a4fb7f4d98faf..0000000000000 --- a/paddle/pten/include/math.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -// See Note: [ How do we organize the kernel directory ] -#include "paddle/pten/api/lib/utils/storage.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/scale_kernel.h" - -namespace pten { - -template -DenseTensor Scale(const ContextT& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale) { - auto out_meta = UnchangedInferMeta(x.meta()); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Scale(dev_ctx, x, scale, bias, bias_after_scale, &dense_out); - return dense_out; -} - -} // namespace pten diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index 9dd3d457e4a26..b6074f117ea14 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -15,9 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" -#include "paddle/pten/kernels/empty_kernel.h" - #include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index fe9a0a033bced..0582fb87b4457 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -13,18 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/pten/kernels/scale_kernel.h" -#include "paddle/pten/kernels/impl/scale_kernel_impl.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" // See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/bfloat16.h" +namespace pten { + +template +void ScaleKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { + // calc + out->mutable_data(); + auto eigen_out = pten::EigenVector::Flatten(*out); + auto eigen_x = pten::EigenVector::Flatten(x); + auto& dev = *dev_ctx.eigen_device(); + // TODO(chenweihang): now the eigen function here need the dtype of scale, + // eigen_x, bias should be same, so here need cast for two scalar arg, + // maybe we declare that the type of scale and bias is T? + paddle::operators::EigenScale, T>::Eval( + dev, + eigen_out, + eigen_x, + scale.to(), + static_cast(bias), + bias_after_scale); +} + +} // namespace pten PT_REGISTER_CTX_KERNEL(scale, CPU, ALL_LAYOUT, - pten::Scale, + pten::ScaleKernel, float, double, paddle::platform::bfloat16, diff --git a/paddle/pten/kernels/flatten_kernel.h b/paddle/pten/kernels/flatten_kernel.h index a67e66fac4130..c974fda1ed363 100644 --- a/paddle/pten/kernels/flatten_kernel.h +++ b/paddle/pten/kernels/flatten_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index 68574c063e77f..ff7e2a6ed284c 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -44,12 +44,12 @@ struct ScaleFunctor { }; template -void Scale(const ContextT& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out) { +void ScaleKernel(const ContextT& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out) { std::vector inputs; std::vector outputs; inputs.emplace_back(&x); @@ -67,7 +67,7 @@ void Scale(const ContextT& dev_ctx, PT_REGISTER_CTX_KERNEL(scale, GPU, ALL_LAYOUT, - pten::Scale, + pten::ScaleKernel, float, double, paddle::platform::float16, diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index 802cc019d78c5..b1bae78ddc5fa 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -// #include "paddle/pten/kernels/complex_kernel.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/complex_kernel.h" #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/impl/dot_grad_kernel_impl.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" diff --git a/paddle/pten/kernels/impl/scale_kernel_impl.h b/paddle/pten/kernels/impl/scale_kernel_impl.h deleted file mode 100644 index 2e0b158b36b8d..0000000000000 --- a/paddle/pten/kernels/impl/scale_kernel_impl.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/funcs/eigen/common.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace pten { - -template -void Scale(const Context& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out) { - // calc - out->mutable_data(); - auto eigen_out = pten::EigenVector::Flatten(*out); - auto eigen_x = pten::EigenVector::Flatten(x); - auto& dev = *dev_ctx.eigen_device(); - // TODO(chenweihang): now the eigen function here need the dtype of scale, - // eigen_x, bias should be same, so here need cast for two scalar arg, - // maybe we declare that the type of scale and bias is T? 
- paddle::operators::EigenScale, T>::Eval( - dev, - eigen_out, - eigen_x, - scale.to(), - static_cast(bias), - bias_after_scale); -} - -} // namespace pten diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index f87d0a31b470b..e01103fc5b847 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -16,7 +16,8 @@ limitations under the License. */ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/binary.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/reshape_kernel.h b/paddle/pten/kernels/reshape_kernel.h index faa51c69ad17c..293f6cd2baf61 100644 --- a/paddle/pten/kernels/reshape_kernel.h +++ b/paddle/pten/kernels/reshape_kernel.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h index 5908050029c7a..ba16db566b8bb 100644 --- a/paddle/pten/kernels/scale_kernel.h +++ b/paddle/pten/kernels/scale_kernel.h @@ -16,15 +16,29 @@ limitations under the License. */ #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/dense_tensor.h" - +#include "paddle/pten/infermeta/unary.h" +#include "paddle/pten/kernels/empty_kernel.h" namespace pten { template -void Scale(const Context& dev_ctx, - const DenseTensor& x, - const Scalar& scale, - float bias, - bool bias_after_scale, - DenseTensor* out); +void ScaleKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale, + DenseTensor* out); + +template +DenseTensor Scale(const ContextT& dev_ctx, + const DenseTensor& x, + const Scalar& scale, + float bias, + bool bias_after_scale) { + auto out_meta = UnchangedInferMeta(x.meta()); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + ScaleKernel( + dev_ctx, x, scale, bias, bias_after_scale, &dense_out); + return dense_out; +} } // namespace pten diff --git a/paddle/pten/kernels/sign_kernel.h b/paddle/pten/kernels/sign_kernel.h index ba205fc96a15c..304b640d2af69 100644 --- a/paddle/pten/kernels/sign_kernel.h +++ b/paddle/pten/kernels/sign_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/empty_kernel.h" namespace pten { diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h index d525b305c7409..41143826c45d8 100644 --- a/paddle/pten/tests/api/scale_api.h +++ b/paddle/pten/tests/api/scale_api.h @@ -23,8 +23,7 @@ #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/include/core.h" -#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/infermeta/unary.h" #include "paddle/pten/kernels/scale_kernel.h" namespace paddle { @@ -92,42 +91,42 @@ static void ScaleCPU(DataType kernel_dtype, pten::DenseTensor* dense_out) { switch (kernel_dtype) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::BFLOAT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::UINT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } @@ -151,42 +150,42 @@ static void ScaleGPU(DataType kernel_dtype, pten::DenseTensor* dense_out) { switch (kernel_dtype) { case pten::DataType::FLOAT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::FLOAT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT64: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT32: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT16: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::INT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } case pten::DataType::UINT8: { - pten::Scale( + pten::ScaleKernel( dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out); break; } diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index ac2922b36f205..fe26f56552b05 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ 
b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include 
 #include 

-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/scale_kernel.h"

 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 35720ae32fe38..e8539b11d1455 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -345,8 +345,10 @@ def source_include(header_file_path):
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/include/core.h"
-#include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/infermeta/binary.h"
+#include "paddle/pten/infermeta/multiary.h"
+#include "paddle/pten/infermeta/nullary.h"
+#include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/declarations.h"
 """

From 23aa7b08d18d9b6a3e80d6bc31d71b481719b0bd Mon Sep 17 00:00:00 2001
From: Sing_chan <51314274+betterpig@users.noreply.github.com>
Date: Thu, 13 Jan 2022 15:04:01 +0800
Subject: [PATCH 10/24] force close eager_generator.exe (#38896)

* force close eager_generator.exe

* modify according to zhouwei's comment
---
 paddle/scripts/paddle_build.bat | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index ca34b12b5d4f8..343ab8ff9f5b7 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -42,7 +42,11 @@ taskkill /f /im nvcc.exe /t 2>NUL
 taskkill /f /im cicc.exe /t 2>NUL
 taskkill /f /im ptxas.exe /t 2>NUL
 taskkill /f /im op_function_generator.exe /t 2>NUL
+taskkill /f /im eager_generator.exe /t 2>NUL
+taskkill /f /im eager_op_function_generator.exe /t 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
+wmic process where name="eager_generator.exe" call terminate 2>NUL
+wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
 wmic process where name="cl.exe" call terminate 2>NUL
@@ -509,8 +513,12 @@ taskkill /f /im nvcc.exe /t 2>NUL
 taskkill /f /im cicc.exe /t 2>NUL
 taskkill /f /im ptxas.exe /t 2>NUL
 taskkill /f /im op_function_generator.exe /t 2>NUL
-wmic process where name="cmake.exe" call terminate 2>NUL
+taskkill /f /im eager_generator.exe /t 2>NUL
+taskkill /f /im eager_op_function_generator.exe /t 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
+wmic process where name="eager_generator.exe" call terminate 2>NUL
+wmic process where name="eager_op_function_generator.exe" call terminate 2>NUL
+wmic process where name="cmake.exe" call terminate 2>NUL
 wmic process where name="cvtres.exe" call terminate 2>NUL
 wmic process where name="rc.exe" call terminate 2>NUL
 wmic process where name="cl.exe" call terminate 2>NUL
@@ -972,7 +980,11 @@ taskkill /f /im nvcc.exe /t 2>NUL
 taskkill /f /im cicc.exe /t 2>NUL
 taskkill /f /im ptxas.exe /t 2>NUL
 taskkill /f /im op_function_generator.exe /t 2>NUL
+taskkill /f /im eager_generator.exe /t 2>NUL
+taskkill /f /im eager_op_function_generator.exe /t 2>NUL
 wmic process where name="op_function_generator.exe" call terminate 2>NUL
+wmic process where name="eager_generator.exe" call terminate 2>NUL
+wmic process where 
name="eager_op_function_generator.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL wmic process where name="cl.exe" call terminate 2>NUL From 7a5af6306bb3f34ea951203e5e36419c0be9ac11 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:31:19 +0800 Subject: [PATCH 11/24] [NPU] fix expand op (#38526) * [NPU] fix expand op * [NPU] optimize codes * [NPU] optimize codes --- paddle/fluid/operators/expand_op_npu.cc | 26 +++++++++++++++---- .../tests/unittests/npu/test_expand_op_npu.py | 21 +++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index 8ecdd5e8cb695..e9f31f8ddd698 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -81,14 +81,30 @@ class ExpandNPUKernel : public framework::OpKernel { out_dims[i] *= expand_times[i]; } - out0->Resize(out_dims); - out0->mutable_data(context.device_context().GetPlace()); - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); + auto place = context.GetPlace(); auto stream = context.template device_context() .stream(); - runner.Run(stream); + + out0->Resize(out_dims); + out0->mutable_data(place); + + bool is_expand_times_all_one = + (out0->numel() == in0->numel()) ? true : false; + + if (is_expand_times_all_one) { + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), + out0->mutable_data(place), + BOOST_GET_CONST(platform::NPUPlace, place), in0->data(), + in0->numel() * sizeof(T), stream); + if (out_dims != in_dims) { + out0->Resize(out_dims); + } + } else { + const auto& runner = + NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); + runner.Run(stream); + } } }; } // namespace operators diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py index 375003f79e500..89ac9e09aa348 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_op_npu.py @@ -132,5 +132,26 @@ def test_npu(self): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +# ------------------------------------------------ +# Special Cases for NPU +# ------------------------------------------------ + + +class TestExpand_expand_times_all_one(TestExpand): + def setUp(self): + self.set_npu() + self.op_type = "expand" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.randn(3, 1, 7).astype(self.dtype) + out = np.tile(x, [1, 1, 1]) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'expand_times': [1, 1, 1]} + self.outputs = {'Out': out} + + if __name__ == '__main__': unittest.main() From eaccdc71dd04b1f42ceac170c82754dd0a953867 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:34:17 +0800 Subject: [PATCH 12/24] [NPU] fix tril_triu (#38864) [NPU] fix tril_triu --- paddle/fluid/operators/tril_triu_op_npu.cc | 41 ++++++++++++++++--- .../unittests/npu/test_tril_triu_op_npu.py | 16 +++++++- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index ab7a9035fb974..02af711567f84 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ 
b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -33,12 +33,41 @@ class TrilTriuNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr_input = {{"diagonal", diagonal}}; - auto stream = - ctx.template device_context() - .stream(); + const auto& dev_ctx = + ctx.template device_context(); - const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); - runner.Run(stream); + auto op_func_tril = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = NpuOpRunner("Tril", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + + auto op_func_triu = [](const std::vector& inputs, + const std::vector& outputs, + const NPUAttributeMap& attrs, + const platform::NPUDeviceContext& dev_ctx) { + const auto& runner = NpuOpRunner("Triu", inputs, outputs, attrs); + runner.Run(dev_ctx.stream()); + }; + + if (x->type() == framework::proto::VarType::BOOL) { + if (lower) { + NpuOpRunner::TypeAdapter({*x}, {*out}, attr_input, dev_ctx, + op_func_tril, + {framework::proto::VarType::UINT8}, + {framework::proto::VarType::UINT8}); + } else { + NpuOpRunner::TypeAdapter({*x}, {*out}, attr_input, dev_ctx, + op_func_triu, + {framework::proto::VarType::UINT8}, + {framework::proto::VarType::UINT8}); + } + } else { + const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); + runner.Run(dev_ctx.stream()); + } } }; @@ -49,4 +78,6 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( tril_triu, ops::TrilTriuNPUKernel, + ops::TrilTriuNPUKernel, + ops::TrilTriuNPUKernel, ops::TrilTriuNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py index 13adc25a38ca5..8239dd4f3fa89 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid import paddle.tensor as tensor @@ -187,5 +187,19 @@ def test_fluid_api(self): fetch_list=[triu_out]) +# @skip_check_grad_ci(reason="[NPU does not support grad right now.") +class TestNPUTrilTriu_bool(TestNPUTrilTriu): + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_dtype(self): + self.dtype = np.bool + + def initTestCase(self): + self.real_op_type = np.random.choice(['triu', 'tril']) + self.diagonal = None + self.X = np.random.choice([False, True], size=(100)).reshape([10, -1]) + + if __name__ == '__main__': unittest.main() From 7f1234563ff3aab32168a6fbaeb57d73748981c3 Mon Sep 17 00:00:00 2001 From: shangliang Xu Date: Thu, 13 Jan 2022 17:24:53 +0800 Subject: [PATCH 13/24] [bug fix] fix unfold bug in compile time (#38907) --- paddle/fluid/operators/unfold_op.cc | 35 +++++++++++++---------------- paddle/fluid/operators/unfold_op.h | 10 +-------- 2 files changed, 16 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 3f580884aa515..5a8e7e3efbe82 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -143,22 +143,18 @@ class UnfoldOp : public framework::OperatorWithKernel { "but recieved dilations_height: %d dilations_width: %d.", dilations[0], 
dilations[1])); - bool contain_unknown_dim = framework::contain_unknown_dim(in_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - std::vector out_dims; - out_dims.push_back(in_dims[0]); - - int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; - out_dims.push_back(output_channels); - - int output_height = - CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], - paddings[2], strides[0]); - int output_width = - CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], paddings[1], - paddings[3], strides[1]); - // check output height and width + std::vector out_dims; + out_dims.push_back(in_dims[0]); + int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + out_dims.push_back(output_channels); + + int output_height = + CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], + paddings[2], strides[0]); + int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], + paddings[1], paddings[3], strides[1]); + if (ctx->IsRuntime()) { + // only check output height and width in runtime PADDLE_ENFORCE_GT( output_height, 0, platform::errors::InvalidArgument( @@ -179,11 +175,10 @@ class UnfoldOp : public framework::OperatorWithKernel { in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], strides[0], strides[1], dilations[0], dilations[1], output_height, output_width)); - int output_col_length = output_height * output_width; - out_dims.push_back(output_col_length); - - ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); } + int output_col_length = output_height * output_width; + out_dims.push_back(output_col_length); + ctx->SetOutputDim("Y", framework::make_ddim(out_dims)); } protected: diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h index f22559f1f38c2..006e4822fead0 100644 --- a/paddle/fluid/operators/unfold_op.h +++ b/paddle/fluid/operators/unfold_op.h @@ -16,6 +16,7 @@ #include #include + #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/math_function.h" @@ -29,15 +30,6 @@ inline int CalcOutputSize(int input_size, int filter_size, int dilation, int padding1, int padding2, int stride) { const int dkernel = dilation * (filter_size - 1) + 1; int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; - PADDLE_ENFORCE_GT( - output_size, 0UL, - platform::errors::InvalidArgument( - "Due to the settings of padding(%d, %d), filter_size(%d), " - "dilation(%d) and " - "stride(%d), the output size is less than 0, please check " - "again. 
Input_size:%d", - padding1, padding2, filter_size, dilation, stride, input_size)); - return output_size; } From dccdc719ebd863db342c3ef1c8794be2ee391348 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Thu, 13 Jan 2022 19:33:45 +0800 Subject: [PATCH 14/24] [Paddle-Inference] add Paddle Trt config: with_interleaved (#38884) * add Paddle Trt config: with_interleaved --- paddle/fluid/inference/analysis/argument.h | 1 + .../inference/analysis/ir_pass_manager.cc | 2 + .../ir_passes/tensorrt_subgraph_pass.cc | 1 + paddle/fluid/inference/api/analysis_config.cc | 3 + .../fluid/inference/api/analysis_predictor.cc | 7 +++ .../inference/api/paddle_analysis_config.h | 2 + paddle/fluid/inference/api/paddle_api.h | 21 +++++++ .../inference/api/paddle_inference_api.h | 16 ----- .../tensorrt/convert/batch_norm_op.cc | 17 ++++-- .../tensorrt/convert/elementwise_op.cc | 14 +++-- .../inference/tensorrt/convert/gather_op.cc | 2 + .../inference/tensorrt/convert/op_converter.h | 58 ++++++++++++------- .../inference/tensorrt/convert/scale_op.cc | 16 +++++ .../inference/tensorrt/convert/slice_op.cc | 30 +++++----- paddle/fluid/inference/tensorrt/engine.h | 5 ++ 15 files changed, 136 insertions(+), 59 deletions(-) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index aff2f60551de9..175bc55dcff17 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -212,6 +212,7 @@ struct Argument { bool); DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(tensorrt_with_interleaved, TensorRtWithInterleaved, bool); DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path, TensorRtShapeRangeInfoPath, std::string); DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index dcbbee97a772c..3abda782ab6cf 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -108,6 +108,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); + pass->Set("with_interleaved", + new bool(argument->tensorrt_with_interleaved())); pass->Set("precision_mode", new AnalysisConfig::Precision(precision_mode)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index a21118e23aa5c..ef50df3084f8c 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -369,6 +369,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( Get("gpu_device_id"), min_input_shape, max_input_shape, opt_input_shape, disable_trt_plugin_fp16); trt_engine->SetUseOSS(Get("use_oss")); + trt_engine->SetWithInterleaved(Get("with_interleaved")); trt_engine->SetUseDLA(Get("trt_use_dla")); trt_engine->SetDLACore(Get("trt_dla_core")); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index a1ab69906bfc4..273690719336c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -189,6 +189,7 @@ 
AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + CP_MEMBER(trt_with_interleaved_); CP_MEMBER(trt_tuned_dynamic_shape_); CP_MEMBER(trt_allow_build_at_runtime_); CP_MEMBER(collect_shape_range_info_); @@ -864,6 +865,8 @@ std::string AnalysisConfig::Summary() { : "false"}); os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"}); + os.InsertRow({"tensorrt_with_interleaved", + trt_with_interleaved_ ? "true" : "false"}); os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"}); if (trt_use_dla_) { os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)}); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 929984f50a7b8..2799fb9e174d3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -605,6 +605,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); argument_.SetTensorRtUseOSS(config_.trt_use_oss_); + argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); argument_.SetMinInputShape(config_.min_input_shape_); argument_.SetMaxInputShape(config_.max_input_shape_); argument_.SetOptimInputShape(config_.optim_input_shape_); @@ -1603,5 +1604,11 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, #endif return false; } +void InternalUtils::UpdateConfigInterleaved(paddle_infer::Config *c, + bool with_interleaved) { +#ifdef PADDLE_WITH_CUDA + c->trt_with_interleaved_ = with_interleaved; +#endif +} } // namespace experimental } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 77409f95b042e..f65170daccb62 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -796,6 +796,7 @@ struct PD_INFER_DECL AnalysisConfig { bool trt_use_static_engine_{false}; bool trt_use_calib_mode_{true}; bool trt_use_oss_{false}; + bool trt_with_interleaved_{false}; bool trt_use_dla_{false}; int trt_dla_core_{0}; std::map> min_input_shape_{}; @@ -883,6 +884,7 @@ struct PD_INFER_DECL AnalysisConfig { // So we release the memory when the predictor is set up. mutable bool is_valid_{true}; std::string opt_cache_dir_; + friend class paddle_infer::experimental::InternalUtils; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index b137b7ba6f97e..c129efe494b4f 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -405,3 +405,24 @@ PD_INFER_DECL std::shared_ptr MakeCipher( const std::string& config_file); } // namespace paddle + +// forward declation +using cudaStream_t = struct CUstream_st*; +using hipStream_t = struct ihipStream_t*; + +namespace paddle_infer { +class Predictor; +using Config = paddle::AnalysisConfig; +namespace experimental { +class PD_INFER_DECL InternalUtils { + public: + // Note: Can only be used under thread_local semantics. 
+ static bool RunWithExternalStream(paddle_infer::Predictor* pred, + cudaStream_t stream); + static bool RunWithExternalStream(paddle_infer::Predictor* pred, + hipStream_t stream); + static void UpdateConfigInterleaved(paddle_infer::Config* c, + bool with_interleaved); +}; +} // namespace experimental +} // namespace paddle_infer diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index b2b9f2e407478..65906a57f46cb 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -41,27 +41,11 @@ limitations under the License. */ /// \since 2.0.0-beta /// -// forward declation -using cudaStream_t = struct CUstream_st*; -using hipStream_t = struct ihipStream_t*; - namespace paddle_infer { using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; -class Predictor; -namespace experimental { -class PD_INFER_DECL InternalUtils { - public: - // Note: Can only be used under thread_local semantics. - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - cudaStream_t stream); - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - hipStream_t stream); -}; -} // namespace experimental - /// /// \class Predictor /// diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 71a2fa68f1749..0e66165191474 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -45,7 +45,7 @@ class BatchNormOpConverter : public OpConverter { auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front()); auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front()); const float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - + auto output_name = op_desc.Output("Y").front(); PADDLE_ENFORCE_NOT_NULL( Bias_v, platform::errors::NotFound( @@ -145,6 +145,10 @@ class BatchNormOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); expand_layer->setReshapeDimensions(expand_shape); X = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("reshape_before_batchnorm_out: " + output_name).c_str()); + expand_layer->setName( + ("BN_Shuffle: (Output: " + output_name + ")").c_str()); } layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *X, @@ -152,12 +156,13 @@ class BatchNormOpConverter : public OpConverter { shift_weights.get(), scale_weights.get(), power_weights.get(), dynamic_shape_offset); - auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(combile_bias_tensor)); engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); if (x_dim.nbDims < 3 + dynamic_shape_offset) { + layer->getOutput(0)->setName("batch_norm_out"); + layer->setName(("BN: ScaleNd: (Output: " + output_name + ")").c_str()); nvinfer1::Dims squeeze_shape; squeeze_shape.nbDims = x_dim.nbDims; for (int i = 0; i < squeeze_shape.nbDims; i++) { @@ -166,10 +171,12 @@ class BatchNormOpConverter : public OpConverter { squeeze_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); squeeze_layer->setReshapeDimensions(squeeze_shape); - layer = static_cast(squeeze_layer); + RreplenishLayerAndOutput(squeeze_layer, "batchnorm_add_scale", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, + test_mode); } - 
RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, - test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 7c5af43816c44..33f732c19a875 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -50,6 +50,7 @@ class ElementwiseWeightOpConverter : public OpConverter { op_desc.Input("Y").front().c_str())); auto* Y_t = Y_v->GetMutable(); float* weight_data = nullptr; + auto output_name = op_desc.Output("Out")[0]; weight_data = engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false); nvinfer1::Dims dims_x = X->getDimensions(); @@ -80,6 +81,10 @@ class ElementwiseWeightOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); expand_layer->setReshapeDimensions(expand_shape); X = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("elementwise_reshape_out: " + output_name).c_str()); + expand_layer->setName( + ("Elewise: Shuffle: (Output: " + output_name + ")").c_str()); } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( @@ -101,11 +106,12 @@ class ElementwiseWeightOpConverter : public OpConverter { squeeze_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); squeeze_layer->setReshapeDimensions(squeeze_shape); - layer = static_cast(squeeze_layer); + RreplenishLayerAndOutput(squeeze_layer, "elementwise_" + op_type_, + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, + {output_name}, test_mode); } - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, - test_mode); if (op_desc.HasAttr("enable_int8")) { #if IS_TRT_VERSION_GE(5000) CHECK(op_desc.HasAttr("X_scale")); diff --git a/paddle/fluid/inference/tensorrt/convert/gather_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_op.cc index e7b82388b6ab8..a98e7535de1b8 100644 --- a/paddle/fluid/inference/tensorrt/convert/gather_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gather_op.cc @@ -56,6 +56,8 @@ class GatherOpConverter : public OpConverter { index_shape.d[0] = -1; reshape_layer->setReshapeDimensions(index_shape); + reshape_layer->setName( + ("Gather: Shuffle: (Output: " + output_name + ")").c_str()); auto layer = TRT_ENGINE_ADD_LAYER(engine_, Gather, *input_tensor, *reshape_layer->getOutput(0), axis); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 57a26aec6ebcb..7e0c8bf1da177 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -144,28 +144,44 @@ class OpConverter { it->SetEngine(engine); (*it)(op, scope, test_mode); - bool has_out_scale = op_desc.HasAttr("out_threshold"); - if (has_out_scale) { - float out_scale = - BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); - std::string output_name = ""; - if (op_desc.HasOutput("Output")) { - output_name = op_desc.Output("Output").front(); - } else if (op_desc.HasOutput("Out")) { - output_name = op_desc.Output("Out").front(); - } else if (op_desc.HasOutput("Y")) { - output_name = op_desc.Output("Y").front(); - } else { - PADDLE_THROW( - platform::errors::NotFound("Op %s has out threshold but doesn't " - "have an output named \"Output\", " - "\"Out\" or \"Y\".", - op_desc.Type())); + size_t output_num 
= op_desc.OutputNames().size(); + if (output_num == 1) { // The number of output is 1 + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + std::string output_name = ""; + if (op_desc.HasOutput("Output")) { + output_name = op_desc.Output("Output").front(); + } else if (op_desc.HasOutput("Out")) { + output_name = op_desc.Output("Out").front(); + } else if (op_desc.HasOutput("Y")) { + output_name = op_desc.Output("Y").front(); + } else { + PADDLE_THROW( + platform::errors::NotFound("Op %s has out threshold but doesn't " + "have an output named \"Output\", " + "\"Out\" or \"Y\".", + op_desc.Type())); + } + auto* output_itensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_itensor, out_scale); + VLOG(1) << "Set out scale = " << out_scale << " for tensor " + << output_name << "."; + } + } else if (output_num > 1) { // The number of outputs greater than 1 + for (size_t i = 0; i < output_num; ++i) { + if (op_desc.HasAttr("out_" + std::to_string(i) + "_threshold")) { + float out_scale = BOOST_GET_CONST( + float, + op_desc.GetAttr("out_" + std::to_string(i) + "_threshold")); + std::string output_name = + op_desc.Output(op_desc.OutputNames()[i]).front(); + auto* output_itensor = engine->GetITensor(output_name); + engine->SetTensorDynamicRange(output_itensor, out_scale); + VLOG(1) << "Set out scale = " << out_scale << " for tensor " + << output_name << "."; + } } - auto* output_itensor = engine->GetITensor(output_name); - engine->SetTensorDynamicRange(output_itensor, out_scale); - VLOG(1) << "Set out scale = " << out_scale << " for tensor " - << output_name << "."; } } diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index b527f2db53808..8b23a8161f593 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -89,21 +89,34 @@ class ScaleOpConverter : public OpConverter { expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); expand_layer->setReshapeDimensions(expand_shape); input = expand_layer->getOutput(0); + expand_layer->getOutput(0)->setName( + ("before_reshape_out: " + out_name).c_str()); + expand_layer->setName( + ("Scale: before_reshape (Output: " + out_name + ")").c_str()); } if (bias_after_scale) { layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_after_scale_out: " + out_name).c_str()); + layer->setName(("Scale: scale (Output: " + out_name + ")").c_str()); } else { // add bias layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *(input), nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), power_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_before_scale:bias_out: " + out_name).c_str()); + layer->setName(("Scale: scale_bias (Output: " + out_name + ")").c_str()); // mul scale layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *(layer->getOutput(0)), nvinfer1::ScaleMode::kUNIFORM, power_weights.get(), scale_weights.get(), power_weights.get()); + layer->getOutput(0)->setName( + ("bias_before_scale:scale_out: " + out_name).c_str()); + layer->setName(("Scale: scale_scale (Output: " + out_name + ")").c_str()); } PADDLE_ENFORCE_EQ(layer != nullptr, true, @@ -119,6 +132,9 @@ class ScaleOpConverter : public OpConverter { TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); 
squeeze_layer->setReshapeDimensions(squeeze_shape); layer = static_cast(squeeze_layer); + layer->getOutput(0)->setName(("after_reshape_out: " + out_name).c_str()); + layer->setName( + ("Scale: Shuffle_reshape (Output: " + out_name + ")").c_str()); } RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 7f270b1f390b7..2c08f0fe2bded 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -30,10 +30,11 @@ class SliceOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + auto output_name = op_desc.Output("Out")[0]; + float out_scale = 1; if (op_desc.HasAttr("out_threshold")) { - float out_scale = - BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); engine_->SetTensorDynamicRange(input, out_scale); } @@ -71,12 +72,22 @@ class SliceOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) if (engine_->use_oss() && engine_->with_ernie()) { std::vector plugin_inputs; - // plugin_inputs.emplace_back(trans_layer->getOutput(0)); - plugin_inputs.emplace_back(input); - + if (engine_->with_interleaved()) { + auto* shuffler_slice = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + nvinfer1::Permutation transpose_embed{2, 1, 0, 3}; + shuffler_slice->setSecondTranspose(transpose_embed); + engine_->SetTensorDynamicRange(shuffler_slice->getOutput(0), + out_scale); + shuffler_slice->setName( + ("SpecialSlice_interleaved: Shuffle: (Output: " + output_name + + ")") + .c_str()); + plugin_inputs.emplace_back(shuffler_slice->getOutput(0)); + } else { + plugin_inputs.emplace_back(input); + } std::string pos_name; if (engine_->Has("ernie_pos_name")) { pos_name = engine_->Get("ernie_pos_name"); @@ -99,11 +110,6 @@ class SliceOpConverter : public OpConverter { new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); layer = engine_->AddDynamicPlugin(&input, 1, plugin); } -#else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); -#endif } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); @@ -111,8 +117,6 @@ class SliceOpConverter : public OpConverter { new plugin::SlicePlugin(starts, ends, axes, with_fp16); layer = engine_->AddPlugin(&input, 1, plugin); } - - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "slice", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 7aaeb739de194..663534feda1a8 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -407,6 +407,9 @@ class TensorRTEngine { void SetUseDLA(bool use_dla) { use_dla_ = use_dla; } void SetDLACore(int dla_core) { dla_core_ = dla_core; } void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; } + void SetWithInterleaved(bool with_interleaved) { + with_interleaved_ = with_interleaved; + } void ClearWeights() { for (auto& weight_pair : weight_map) { @@ -480,6 +483,7 @@ class TensorRTEngine { bool use_oss() { return use_oss_; } bool with_ernie() { return with_ernie_; } + bool with_interleaved() { 
return with_interleaved_; } bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool with_dynamic_shape() { return with_dynamic_shape_; } AnalysisConfig::Precision precision() { return precision_; } @@ -612,6 +616,7 @@ class TensorRTEngine { bool use_dla_{false}; int dla_core_{0}; bool with_ernie_{false}; + bool with_interleaved_{false}; nvinfer1::ILogger& logger_; // max data size for the buffers. From 158bf13f1c133c6af77674560e33413be552d51f Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 13 Jan 2022 19:52:33 +0800 Subject: [PATCH 15/24] [PTen] Rename kernel register marco (#38861) * rename register marco * fix error changing * fix format error --- cmake/pten_kernel.cmake | 6 +- paddle/pten/core/kernel_registry.h | 820 +++--------------- paddle/pten/kernels/cpu/cast_kernel.cc | 30 +- paddle/pten/kernels/cpu/complex_kernel.cc | 20 +- paddle/pten/kernels/cpu/dot_grad_kernel.cc | 20 +- paddle/pten/kernels/cpu/dot_kernel.cc | 20 +- paddle/pten/kernels/cpu/full_kernel.cc | 50 +- paddle/pten/kernels/cpu/math_kernel.cc | 108 +-- paddle/pten/kernels/cpu/matmul_grad_kernel.cc | 52 +- paddle/pten/kernels/cpu/matmul_kernel.cc | 16 +- paddle/pten/kernels/cpu/scale_kernel.cc | 24 +- paddle/pten/kernels/cpu/sign_kernel.cc | 3 +- paddle/pten/kernels/empty_kernel.cc | 116 +-- paddle/pten/kernels/flatten_grad_kernel.cc | 60 +- paddle/pten/kernels/flatten_kernel.cc | 120 +-- paddle/pten/kernels/gpu/cast_kernel.cu | 36 +- paddle/pten/kernels/gpu/complex_kernel.cu | 22 +- paddle/pten/kernels/gpu/dot_grad_kernel.cu | 20 +- paddle/pten/kernels/gpu/dot_kernel.cu | 20 +- paddle/pten/kernels/gpu/full_kernel.cu | 48 +- paddle/pten/kernels/gpu/math_kernel.cu | 116 +-- paddle/pten/kernels/gpu/matmul_grad_kernel.cu | 58 +- paddle/pten/kernels/gpu/matmul_kernel.cu | 18 +- paddle/pten/kernels/gpu/scale_kernel.cu | 24 +- paddle/pten/kernels/gpu/sign_kernel.cu | 2 +- 25 files changed, 636 insertions(+), 1193 deletions(-) diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake index f962c1332093a..bc9fefb58f452 100644 --- a/cmake/pten_kernel.cmake +++ b/cmake/pten_kernel.cmake @@ -16,12 +16,12 @@ function(kernel_declare TARGET_LIST) foreach(kernel_path ${TARGET_LIST}) file(READ ${kernel_path} kernel_impl) - # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL + # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL # NOTE(chenweihang): now we don't recommend to use digit in kernel name - string(REGEX MATCH "(PT_REGISTER_CTX_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") if (NOT first_registry STREQUAL "") # parse the first kernel name - string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}") string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") string(REPLACE "," "" kernel_name "${kernel_name}") string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index f08ef4acfd9ce..194ab52d25688 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -213,20 +213,20 @@ struct KernelRegistrar { * pointer of the corresponding data type is automatically instantiated * during registration. 
* - * Note: `1TA` means `1 template argument` + * Note: `2TA` means `2 template argument` */ #define PT_REGISTER_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ "PT_REGISTER_KERNEL must be called in global namespace."); \ - _PT_REGISTER_1TA_KERNEL( \ + _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) #ifndef _WIN32 -#define _PT_REGISTER_1TA_KERNEL( \ +#define _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ ::pten::Kernel*); \ PT_KERNEL_REGISTRAR_INIT( \ @@ -252,7 +252,7 @@ struct KernelRegistrar { * * And msvc can work without template instantiation */ -#define _PT_REGISTER_1TA_KERNEL( \ +#define _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ ::pten::Kernel*); \ @@ -268,60 +268,76 @@ struct KernelRegistrar { ::pten::Kernel* kernel) #endif -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - cpp_dtype, \ +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ...) \ + _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + meta_kernel_fn, \ + backend, \ + cpp_dtype, \ __VA_ARGS__) -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, cpp_dtype, __VA_ARGS__) +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, cpp_dtype, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__) -#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn +#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) 
\ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) \ + template decltype(meta_kernel_fn) \ + meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) #define PT_KERNEL_REGISTRAR_INIT( \ kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ @@ -373,10 +389,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ backend, \ @@ -393,10 +410,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ backend, \ layout, \ @@ -419,10 +437,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ backend, \ layout, \ @@ -445,10 +464,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ backend, \ layout, \ @@ -471,10 +491,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ backend, 
\ layout, \ @@ -497,10 +518,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ backend, \ layout, \ @@ -523,10 +545,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ backend, \ layout, \ @@ -549,10 +572,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ backend, \ layout, \ @@ -575,10 +599,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ backend, \ layout, \ @@ -601,10 +626,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ backend, \ layout, \ @@ -627,10 +653,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ backend, \ layout, \ @@ -653,10 +680,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ backend, \ layout, \ @@ -679,10 +707,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ backend, \ layout, \ @@ -705,10 +734,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ 
::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ backend, \ layout, \ @@ -731,10 +761,11 @@ struct KernelRegistrar { DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ + &meta_kernel_fn)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL( \ + meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ backend, \ layout, \ @@ -743,41 +774,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) -/** PT_REGISTER_NO_TEMPLATE_KERNEL - * - * Basic Kernel register marco, used to register a no template argument kernel - * function, pass in the complete function pointe of the kernel, this - * registration macro will not do automatic template instantiation. - * - * Note: developer maybe register 2 kernel with same name, backend and diff - * layout, so the layout also need to be a part of symbol var name. If developer - * register 2 kernel with same name, backend, layout and diff dtype, he should - * use another register marco PT_REGISTER_KERNEL. - * - * TODO(chenweihang): remove this marco later - */ -#define PT_REGISTER_NO_TEMPLATE_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - static const ::pten::KernelRegistrar \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::pten::KernelArgsParseFunctor::Parse, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - PT_KERNEL(kernel_fn), \ - PT_VARIADIC_KERNEL(kernel_fn)); \ - int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { \ - return 0; \ - } \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) - /** PT_REGISTER_GENERAL_KERNEL * * Basic Kernel register marco, used to register a instantiated kernel function @@ -832,558 +828,6 @@ struct KernelRegistrar { ::pten::Kernel* kernel) #endif -/** PT_REGISTER_CTX_KERNEL - * - * Used for kernel registration with device context and data type as - * template parameter. - */ -#define PT_REGISTER_CTX_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_tp_ctx_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_CTX_KERNEL must be called in global namespace."); \ - _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) - -#ifndef _WIN32 -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\ - PT_KERNEL_INSTANTIATION2(meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__); \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, \ - backend, \ - layout, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) -#else -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, \ - backend, \ - layout, \ - &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) -#endif - -#define PT_KERNEL_INSTANTIATION2(meta_kernel_fn, backend, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION2(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - backend, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION2(N, meta_kernel_fn, backend, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION2_, N) \ - (meta_kernel_fn, backend, cpp_dtype, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION2_1(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION2_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_1(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_2(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_3(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_4(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_5(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_6(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_7(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_8(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_9(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_11(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_10(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_11(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_12(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_13(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION2_15(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION2_14(meta_kernel_fn, backend, __VA_ARGS__)) - -#define PT_KERNEL_REGISTRAR_INIT2( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_REGISTRAR_INIT2(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -// clang-format off - -/* The =pre-commit always treats this macro into the wrong format, - and multi-line macros cannot be skipped with NOLINT.*/ -#define _PT_KERNEL_REGISTRAR_INIT2(N, \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT2_, N) ( \ - kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -// clang-format on - -#define _PT_KERNEL_REGISTRAR_INIT2_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } -#define _PT_KERNEL_REGISTRAR_INIT2_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_1(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_2(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_3(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_4(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_5(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_6(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_7(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_8(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_9(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_10(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_11(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_12(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_13(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT2_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::pten::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL( \ - meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT2_14(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - __VA_ARGS__)) - /** PT_DECLARE_KERNEL * * Used to export the symbols of the file where the kernel is located, diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc index c6736cdd1bcf0..a0006f49a2b38 100644 --- a/paddle/pten/kernels/cpu/cast_kernel.cc +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -58,20 +58,20 @@ void CastKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(cast, - CPU, - ALL_LAYOUT, - pten::CastKernel, - float, - double, - int, - int64_t, - int16_t, - bool, - uint8_t, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) { +PT_REGISTER_KERNEL(cast, + CPU, + ALL_LAYOUT, + pten::CastKernel, + float, + double, + int, + int64_t, + int16_t, + bool, + uint8_t, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/cpu/complex_kernel.cc b/paddle/pten/kernels/cpu/complex_kernel.cc index 10e7e684db3c1..59a7577153a61 100644 --- a/paddle/pten/kernels/cpu/complex_kernel.cc +++ b/paddle/pten/kernels/cpu/complex_kernel.cc @@ -21,13 +21,13 @@ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(conj, - CPU, - ALL_LAYOUT, - pten::ConjKernel, - paddle::platform::complex, - paddle::platform::complex, - float, - double, - int, - int64_t) {} +PT_REGISTER_KERNEL(conj, + CPU, + ALL_LAYOUT, + pten::ConjKernel, + paddle::platform::complex, + paddle::platform::complex, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/dot_grad_kernel.cc b/paddle/pten/kernels/cpu/dot_grad_kernel.cc index c9d5c35e134c8..ed927f820f0e7 100644 --- a/paddle/pten/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_grad_kernel.cc @@ -20,13 +20,13 @@ #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(dot_grad, - CPU, - ALL_LAYOUT, - pten::DotGradKernel, - float, - double, - int, - int64_t, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(dot_grad, + CPU, + ALL_LAYOUT, + pten::DotGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index 72e9e28907f90..0baf9ba0a8bdd 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -49,13 +49,13 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL(dot, - CPU, - ALL_LAYOUT, - pten::DotKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} +PT_REGISTER_KERNEL(dot, + CPU, + ALL_LAYOUT, + pten::DotKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cpu/full_kernel.cc b/paddle/pten/kernels/cpu/full_kernel.cc index 1ae8001d79dc7..919471d86ac53 100644 --- a/paddle/pten/kernels/cpu/full_kernel.cc +++ b/paddle/pten/kernels/cpu/full_kernel.cc @@ -18,29 +18,29 @@ limitations under the License. 
*/ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/full_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(full, - CPU, - ALL_LAYOUT, - pten::FullKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(full, + CPU, + ALL_LAYOUT, + pten::FullKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(full_like, - CPU, - ALL_LAYOUT, - pten::FullLikeKernel, - float, - double, - int, - int64_t, - bool, - paddle::platform::float16) {} +PT_REGISTER_KERNEL(full_like, + CPU, + ALL_LAYOUT, + pten::FullLikeKernel, + float, + double, + int, + int64_t, + bool, + paddle::platform::float16) {} diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index be0d52355bce6..83388d0d9a80f 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -118,60 +118,60 @@ using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {} -PT_REGISTER_CTX_KERNEL(add, - CPU, - ALL_LAYOUT, - pten::AddKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(subtract, - CPU, - ALL_LAYOUT, - pten::SubtractKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(divide, - CPU, - ALL_LAYOUT, - pten::DivideKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(multiply, - CPU, - ALL_LAYOUT, - pten::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(sum, - CPU, - ALL_LAYOUT, - pten::SumKernel, - bool, - float, - double, - paddle::platform::float16, - int, - int64_t, - complex64, - complex128) { +PT_REGISTER_KERNEL(add, + CPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(subtract, + CPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(divide, + CPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(multiply, + CPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128) {} +PT_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + paddle::platform::float16, + int, + int64_t, + complex64, + complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc index 5a8abb6701b0e..4738e21573194 100644 --- a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc @@ -19,29 +19,29 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul_grad, - CPU, - ALL_LAYOUT, - pten::MatmulGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_double_grad, - CPU, - ALL_LAYOUT, - pten::MatmulDoubleGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_triple_grad, - CPU, - ALL_LAYOUT, - pten::MatmulTripleGradKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul_grad, + CPU, + ALL_LAYOUT, + pten::MatmulGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_double_grad, + CPU, + ALL_LAYOUT, + pten::MatmulDoubleGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_triple_grad, + CPU, + ALL_LAYOUT, + pten::MatmulTripleGradKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/matmul_kernel.cc b/paddle/pten/kernels/cpu/matmul_kernel.cc index edba402ec1d84..f749e9cb27979 100644 --- a/paddle/pten/kernels/cpu/matmul_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_kernel.cc @@ -20,11 +20,11 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul, - CPU, - ALL_LAYOUT, - pten::MatmulKernel, - float, - double, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul, + CPU, + ALL_LAYOUT, + pten::MatmulKernel, + float, + double, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/cpu/scale_kernel.cc b/paddle/pten/kernels/cpu/scale_kernel.cc index 0582fb87b4457..7088bba01aa78 100644 --- a/paddle/pten/kernels/cpu/scale_kernel.cc +++ b/paddle/pten/kernels/cpu/scale_kernel.cc @@ -51,15 +51,15 @@ void ScaleKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(scale, - CPU, - ALL_LAYOUT, - pten::ScaleKernel, - float, - double, - paddle::platform::bfloat16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(scale, + CPU, + ALL_LAYOUT, + pten::ScaleKernel, + float, + double, + paddle::platform::bfloat16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/pten/kernels/cpu/sign_kernel.cc b/paddle/pten/kernels/cpu/sign_kernel.cc index a7b62822d6e0f..25fa2bb5fe4ef 100644 --- a/paddle/pten/kernels/cpu/sign_kernel.cc +++ b/paddle/pten/kernels/cpu/sign_kernel.cc @@ -21,5 +21,4 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/platform/bfloat16.h" -PT_REGISTER_CTX_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) { -} +PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, pten::SignKernel, float, double) {} diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index 2dd55a13e38e5..eb67ed6655f47 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -34,66 +34,66 @@ void EmptyLikeKernel(const Context& dev_ctx, DenseTensor* out) { } // namespace pten -PT_REGISTER_CTX_KERNEL(empty, - CPU, - ALL_LAYOUT, - pten::EmptyKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty, + CPU, + ALL_LAYOUT, + pten::EmptyKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(empty_like, - CPU, - ALL_LAYOUT, - pten::EmptyLikeKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty_like, + CPU, + ALL_LAYOUT, + pten::EmptyLikeKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::bfloat16, + paddle::platform::complex, + paddle::platform::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(empty, - GPU, - ALL_LAYOUT, - pten::EmptyKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty, + GPU, + ALL_LAYOUT, + pten::EmptyKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(empty_like, - GPU, - ALL_LAYOUT, - pten::EmptyLikeKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(empty_like, + GPU, + ALL_LAYOUT, + pten::EmptyLikeKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} #endif diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc index d6aea31748d6c..45f3c6558d9c8 100644 --- a/paddle/pten/kernels/flatten_grad_kernel.cc +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -33,41 +33,41 @@ void FlattenGradKernel(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(flatten_grad, - CPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + CPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(flatten_grad, - GPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + GPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + double, 
+ uint8_t, + int8_t, + int, + int64_t) {} #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_CTX_KERNEL(flatten_grad, - XPU, - ALL_LAYOUT, - pten::FlattenGradKernel, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_grad, + XPU, + ALL_LAYOUT, + pten::FlattenGradKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} #endif diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index b284d3690830f..9201a8df9d166 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -48,72 +48,72 @@ void FlattenWithXShape(const Context& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(flatten, - CPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + CPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - CPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + CPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + double, + uint8_t, + int8_t, + int, + int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_CTX_KERNEL(flatten, - GPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + GPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - GPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - paddle::platform::float16, - double, - uint8_t, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + GPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + paddle::platform::float16, + double, + uint8_t, + int8_t, + int, + int64_t) {} #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_CTX_KERNEL(flatten, - XPU, - ALL_LAYOUT, - pten::FlattenKernel, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten, + XPU, + ALL_LAYOUT, + pten::FlattenKernel, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} -PT_REGISTER_CTX_KERNEL(flatten_with_xshape, - XPU, - ALL_LAYOUT, - pten::FlattenWithXShape, - float, - paddle::platform::float16, - int8_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(flatten_with_xshape, + XPU, + ALL_LAYOUT, + pten::FlattenWithXShape, + float, + paddle::platform::float16, + int8_t, + int, + int64_t) {} #endif diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu index 0bbe7a3a132d1..2f91c94ba5f75 100644 --- a/paddle/pten/kernels/gpu/cast_kernel.cu +++ b/paddle/pten/kernels/gpu/cast_kernel.cu @@ -60,24 +60,24 @@ void CastKernel(const Context& dev_ctx, } // namespace pten -#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ - PT_REGISTER_CTX_KERNEL(cast, \ - GPU, \ - ALL_LAYOUT, \ - pten::CastKernel, \ - float, \ - double, \ - int, \ - int64_t, \ - int16_t, \ - bool, \ - uint8_t, \ - paddle::platform::float16, \ - paddle::platform::complex, \ - paddle::platform::complex, \ - ##__VA_ARGS__) { \ - kernel->OutputAt(0).SetDataType( \ - paddle::experimental::DataType::UNDEFINED); \ +#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) 
\ + PT_REGISTER_KERNEL(cast, \ + GPU, \ + ALL_LAYOUT, \ + pten::CastKernel, \ + float, \ + double, \ + int, \ + int64_t, \ + int16_t, \ + bool, \ + uint8_t, \ + paddle::platform::float16, \ + paddle::platform::complex, \ + paddle::platform::complex, \ + ##__VA_ARGS__) { \ + kernel->OutputAt(0).SetDataType( \ + paddle::experimental::DataType::UNDEFINED); \ } #if !defined(PADDLE_WITH_HIP) diff --git a/paddle/pten/kernels/gpu/complex_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu index 02f050f5bc838..1c82077793e0a 100644 --- a/paddle/pten/kernels/gpu/complex_kernel.cu +++ b/paddle/pten/kernels/gpu/complex_kernel.cu @@ -21,14 +21,14 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(conj, - GPU, - ALL_LAYOUT, - pten::ConjKernel, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex, - float, - double, - int, - int64_t) {} +PT_REGISTER_KERNEL(conj, + GPU, + ALL_LAYOUT, + pten::ConjKernel, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex, + float, + double, + int, + int64_t) {} diff --git a/paddle/pten/kernels/gpu/dot_grad_kernel.cu b/paddle/pten/kernels/gpu/dot_grad_kernel.cu index 42af96f7c7265..4b0d7fed4c9fd 100644 --- a/paddle/pten/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_grad_kernel.cu @@ -20,13 +20,13 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" -PT_REGISTER_CTX_KERNEL(dot_grad, - GPU, - ALL_LAYOUT, - pten::DotGradKernel, - float, - double, - int, - int64_t, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(dot_grad, + GPU, + ALL_LAYOUT, + pten::DotGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu index 08d8f83c408de..18bab5c15a058 100644 --- a/paddle/pten/kernels/gpu/dot_kernel.cu +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -52,13 +52,13 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL(dot, - GPU, - ALL_LAYOUT, - pten::DotKernel, - float, - double, - int, - int64_t, - complex64, - complex128) {} +PT_REGISTER_KERNEL(dot, + GPU, + ALL_LAYOUT, + pten::DotKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu index ae1f8529db3de..2f6346daa888f 100644 --- a/paddle/pten/kernels/gpu/full_kernel.cu +++ b/paddle/pten/kernels/gpu/full_kernel.cu @@ -18,28 +18,28 @@ limitations under the License. 
*/ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/impl/full_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(full, - GPU, - ALL_LAYOUT, - pten::FullKernel, - float, - double, - uint8_t, - int16_t, - int, - int64_t, - bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(full, + GPU, + ALL_LAYOUT, + pten::FullKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t, + bool, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} -PT_REGISTER_CTX_KERNEL(full_like, - GPU, - ALL_LAYOUT, - pten::FullLikeKernel, - float, - double, - int, - int64_t, - bool, - paddle::platform::float16) {} +PT_REGISTER_KERNEL(full_like, + GPU, + ALL_LAYOUT, + pten::FullLikeKernel, + float, + double, + int, + int64_t, + bool, + paddle::platform::float16) {} diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 557080638038d..1fd085ab5fe40 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -110,64 +110,64 @@ using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool, float16) {} -PT_REGISTER_CTX_KERNEL(add, - GPU, - ALL_LAYOUT, - pten::AddKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(subtract, - GPU, - ALL_LAYOUT, - pten::SubtractKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(divide, - GPU, - ALL_LAYOUT, - pten::DivideKernel, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(multiply, - GPU, - ALL_LAYOUT, - pten::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128) {} -PT_REGISTER_CTX_KERNEL(sum, - GPU, - ALL_LAYOUT, - pten::SumKernel, - bool, - float, - double, - float16, - int, - int64_t, - complex64, - complex128) { +PT_REGISTER_KERNEL(add, + GPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(subtract, + GPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(divide, + GPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(multiply, + GPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + float16, + int, + int64_t, + complex64, + complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index f20c3f82c9262..993b17f6b8ed0 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -19,32 +19,32 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul_grad, - GPU, - ALL_LAYOUT, - pten::MatmulGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_double_grad, - GPU, - ALL_LAYOUT, - pten::MatmulDoubleGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} - -PT_REGISTER_CTX_KERNEL(matmul_triple_grad, - GPU, - ALL_LAYOUT, - pten::MatmulTripleGradKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul_grad, + GPU, + ALL_LAYOUT, + pten::MatmulGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_double_grad, + GPU, + ALL_LAYOUT, + pten::MatmulDoubleGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} + +PT_REGISTER_KERNEL(matmul_triple_grad, + GPU, + ALL_LAYOUT, + pten::MatmulTripleGradKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index debda455818a9..a3ab88913a3b6 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -20,12 +20,12 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/pten/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_CTX_KERNEL(matmul, - GPU, - ALL_LAYOUT, - pten::MatmulKernel, - float, - double, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} +PT_REGISTER_KERNEL(matmul, + GPU, + ALL_LAYOUT, + pten::MatmulKernel, + float, + double, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index ff7e2a6ed284c..4d63701413cd6 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -64,15 +64,15 @@ void ScaleKernel(const ContextT& dev_ctx, } // namespace pten -PT_REGISTER_CTX_KERNEL(scale, - GPU, - ALL_LAYOUT, - pten::ScaleKernel, - float, - double, - paddle::platform::float16, - uint8_t, - int8_t, - int16_t, - int, - int64_t) {} +PT_REGISTER_KERNEL(scale, + GPU, + ALL_LAYOUT, + pten::ScaleKernel, + float, + double, + paddle::platform::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/pten/kernels/gpu/sign_kernel.cu b/paddle/pten/kernels/gpu/sign_kernel.cu index e7eb7e46861c8..16356507dc8ea 100644 --- a/paddle/pten/kernels/gpu/sign_kernel.cu +++ b/paddle/pten/kernels/gpu/sign_kernel.cu @@ -23,5 +23,5 @@ limitations under the License. 
*/ using float16 = paddle::platform::float16; -PT_REGISTER_CTX_KERNEL( +PT_REGISTER_KERNEL( sign, GPU, ALL_LAYOUT, pten::SignKernel, float, double, float16) {} From 9ff989aeae54472f766bc6ffef8a13111ca8da51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Fri, 14 Jan 2022 11:26:01 +0800 Subject: [PATCH 16/24] remove interface: DenseTensor::release, test=develop (#38937) --- paddle/fluid/pybind/eager_method.cc | 2 +- paddle/pten/api/lib/utils/tensor_utils.cc | 6 ++---- paddle/pten/core/dense_tensor.h | 6 ------ paddle/pten/tests/core/test_dense_tensor.cc | 7 ------- 4 files changed, 3 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index a8c1da2a8b866..46b56f27ff98e 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -189,7 +189,7 @@ static PyObject* eager_tensor__clear_gradient(EagerTensorObject* self, << " is initialized, will be released."; auto dense_tensor = std::dynamic_pointer_cast(grad->impl()); - dense_tensor->release(); + dense_tensor->MoveMemoryHolder(); } Py_INCREF(Py_None); return Py_None; diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 0b6cb8d95cc1a..53d641896e43f 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -306,10 +306,8 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { "The destination Tensor is nullptr when move storage.")); dst->Resize(src->dims()); dst->set_type(pten::TransToProtoVarType(src->dtype())); - auto storage = src->release(); - std::shared_ptr holder( - new TensorStorage(std::move(storage))); - dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); + auto storage = src->MoveMemoryHolder(); + dst->ResetHolderWithType(storage, pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); } diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 1802a2461158f..4f25fc296724c 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -172,12 +172,6 @@ class DenseTensor : public TensorBase, /// \return The actual storage size occupied by tensor. size_t capacity() const { return storage_->size(); } - /// \brief Release the storage area for other purposes. Because of the - /// destruction of encapsulation, we do not support two dense tensors directly - /// sharing the same intrusive pointer. - /// \return The rvalue of instrusize pointer releated to the released storage. - intrusive_ptr release() { return std::move(storage_); } - /// \brief Get the mutable data pointer value of type T. /// Memory allocation may occur when calling this interface: /// 1. 
When the storage size is not enough to meet the current shape of the diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index c6db228c2b757..8277c0d8dadb7 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -116,9 +116,6 @@ TEST(dense_tensor, resize) { CHECK_EQ(tensor_0.capacity(), 6u); tensor_0.mutable_data(); CHECK_EQ(tensor_0.capacity(), 6u); - - auto storage = tensor_0.release(); - CHECK_EQ(storage->size(), 6u); } TEST(dense_tensor, shallow_copy) { @@ -133,10 +130,6 @@ TEST(dense_tensor, shallow_copy) { DenseTensor tensor_1(tensor_0); CHECK(tensor_0.meta() == tensor_1.meta()); - - // Copy constructor: Now shares the underlying shared_ptr instead - // of Storage - CHECK(tensor_0.release() != tensor_1.release()); } } // namespace tests From 9e0686ed45f79bbe6a5434bf453509cab0b630ea Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 14 Jan 2022 11:29:37 +0800 Subject: [PATCH 17/24] fix bug of -DPADDLE_WITH_SSE3 not set when WITH_AVX AND AVX_FOUND even SSE3_FOUND (#38931) --- cmake/configure.cmake | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 32ba2ff3ac627..88e8dde8addbc 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -31,10 +31,12 @@ endif(NOT WITH_PROFILER) if(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) add_definitions(-DPADDLE_WITH_AVX) -elseif(SSE3_FOUND) - if(NOT WIN32) - set(SIMD_FLAG ${SSE3_FLAG}) - endif() +elseif(SSE3_FOUND AND NOT WIN32) + set(SIMD_FLAG ${SSE3_FLAG}) +endif() + +if (SSE3_FOUND) + # TODO: Runtime detection should be used here. add_definitions(-DPADDLE_WITH_SSE3) endif() From 7f8d5bc8f02d10db46cce9a975db584528742ed7 Mon Sep 17 00:00:00 2001 From: qipengh Date: Fri, 14 Jan 2022 11:37:26 +0800 Subject: [PATCH 18/24] [MLU]Add mean and reduce_mean op (#38872) * [MLU]: add mean and reduce mean op * [MLU]add mlu pytest dir in CMakeLists.txt * [MLU]fix tensor data * [MLU]fix TensorToPyArray and license --- paddle/fluid/framework/tensor_util.cc | 40 +++- paddle/fluid/memory/detail/buddy_allocator.cc | 5 +- paddle/fluid/memory/memcpy.cc | 10 + paddle/fluid/operators/mean_op_mlu.cc | 127 ++++++++++++ paddle/fluid/operators/mlu/mlu_baseop.h | 15 +- .../reduce_ops/reduce_mean_op_mlu.cc | 127 ++++++++++++ paddle/fluid/pybind/tensor_py.h | 28 ++- .../fluid/tests/unittests/CMakeLists.txt | 4 + .../fluid/tests/unittests/mlu/CMakeLists.txt | 9 + .../tests/unittests/mlu/test_mean_op_mlu.py | 83 ++++++++ .../unittests/mlu/test_reduce_mean_op_mlu.py | 185 ++++++++++++++++++ .../tests/unittests/mlu/test_relu_op_mlu.py | 166 ++++++++++++++++ .../paddle/fluid/tests/unittests/op_test.py | 11 +- 13 files changed, 796 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/operators/mean_op_mlu.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5fd581220097b..724e3cc1e2ee8 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -396,7 +396,8 @@ void 
TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, TENSOR* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place)) { + if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) || + platform::is_mlu_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); @@ -1048,6 +1049,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_mlu_place(tensor.place())) { +#ifdef PADDLE_WITH_MLU + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& mlu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::MLUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + mlu_dev_ctx.stream()); + mlu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); #endif } else if (platform::is_npu_place(tensor.place())) { #ifdef PADDLE_WITH_ASCEND_CL @@ -1127,9 +1151,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_mlu_place(dev_ctx.GetPlace()) || platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -1148,6 +1174,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); } else { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); @@ -1192,9 +1221,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor, size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_mlu_place(dev_ctx.GetPlace()) || platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \ + defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -1213,6 +1244,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not 
supported when not compiled with XPU")); + } else if (platform::is_mlu_place(dev_ctx.GetPlace())) { + PADDLE_THROW(platform::errors::Unimplemented( + "MLUPlace is not supported when not compiled with MLU")); } else { PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported when not compiled with NPU")); diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 96fcd6254d885..b02fb6642be3f 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -231,9 +231,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( allocate_bytes = DeviceAllocateSize(&platform::NPUInitAllocSize, &platform::NPUReallocSize, request_bytes); #elif defined(PADDLE_WITH_MLU) - allocate_bytes = - DeviceAllocateSize(&platform::MLUInitAllocSize(), - &platform::MLUReallocSize(), request_bytes); + allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize, + &platform::MLUReallocSize, request_bytes); #endif // Allocate a new block diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index e6aed2c90dace..153e19a9f1450 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -508,6 +508,9 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); platform::MLUMemcpyD2HAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyD2HSync:MLU->CPU"); @@ -530,6 +533,9 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); platform::MLUMemcpyH2DAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyH2DSync:CPU->MLU"); @@ -554,6 +560,10 @@ void Copy(platform::MLUPlace dst_place, "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); platform::MLUMemcpyD2DAsync(dst, src, num, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU"); diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc new file mode 100644 index 0000000000000..9862c2bd95256 --- /dev/null +++ b/paddle/fluid/operators/mean_op_mlu.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
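+
+// MLU implementation of the mean op:
+//  - MeanMLUKernel reduces X with MLUCnnl::Reduce (CNNL_REDUCE_AVG over all
+//    dims) to produce Out; rank-0 inputs are copied through directly.
+//  - MeanMLUGradKernel expects Out@Grad to hold a single element, fills
+//    X@Grad with 1 / numel(X) via MLUCnnl::Fill, and multiplies it by
+//    Out@Grad with MLUCnnl::OpTensor (CNNL_OP_TENSOR_MUL).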
+ +#include "paddle/fluid/operators/mean_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +class MeanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + const T* in_data = input->data(); + T* out_data = output->mutable_data(context.GetPlace()); + auto numel = input->numel(); + auto rank = input->dims().size(); + auto place = context.GetPlace(); + auto stream = context.template device_context().stream(); + + if (rank == 0) { // scalar + auto mlu_place = BOOST_GET(platform::MLUPlace, place); + memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T), + stream); + return; + } + + std::vector reduce_dims; + reduce_dims.reserve(rank); + for (decltype(rank) i = 0; i < rank; ++i) { + reduce_dims.push_back(i); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->type())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->type())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), + reinterpret_cast(in_data), 0 /*indices_size*/, + nullptr, nullptr, output_desc.get(), + reinterpret_cast(out_data)); + } +}; + +template +class MeanMLUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto output_grad = context.Input(framework::GradVarName("Out")); + PADDLE_ENFORCE_EQ(output_grad->numel(), 1, + platform::errors::InvalidArgument( + "Mean Gradient Input Tensor len should be 1. 
But " + "received Out@Grad's elements num is %d.", + output_grad->numel())); + auto input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + auto in_data = output_grad->data(); + auto numel = input_grad->numel(); + auto rank = input_grad->dims().size(); + auto out_data = input_grad->data(); + auto place = context.GetPlace(); + auto stream = context.template device_context().stream(); + + if (rank == 0) { // scalar + auto mlu_place = BOOST_GET(platform::MLUPlace, place); + memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T), + stream); + return; + } + + // means + Tensor mean_var(output_grad->type()); + mean_var.mutable_data(input_grad->dims(), context.GetPlace()); + MLUCnnlTensorDesc mean_var_desc(mean_var, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(mean_var.type())); + auto value = static_cast(1.0 / static_cast(input_grad->numel())); + MLUCnnl::Fill(context, value, mean_var_desc.get(), GetBasePtr(&mean_var)); + + // means mul output_grad + MLUCnnlTensorDesc in_desc(*output_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output_grad->type())); + MLUCnnlTensorDesc out_desc(*input_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input_grad->type())); + + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(context, op_tensor_desc.get(), in_desc.get(), + reinterpret_cast(in_data), + mean_var_desc.get(), GetBasePtr(&mean_var), + out_desc.get(), reinterpret_cast(out_data), + ToCnnlDataType()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(mean, ops::MeanMLUKernel, + ops::MeanMLUKernel); +REGISTER_OP_MLU_KERNEL(mean_grad, ops::MeanMLUGradKernel, + ops::MeanMLUGradKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index ab398a92c2972..8082c45d14b95 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -45,12 +45,22 @@ enum MLULogicMethod { CNNL_LOGIC_OP_OR = 7, }; +inline const void* GetBasePtr(const Tensor* t) { return t->data(); } + +inline void* GetBasePtr(Tensor* t) { return t->data(); } + template inline cnnlDataType_t ToCnnlDataType(const T& t) { auto type = framework::ToDataType(t); return ToCnnlDataType(type); } +template +inline cnnlDataType_t ToCnnlDataType() { + auto type = framework::ToDataType(std::type_index(typeid(T))); + return ToCnnlDataType(type); +} + template <> inline cnnlDataType_t ToCnnlDataType(const framework::proto::VarType::Type& t) { cnnlDataType_t type = CNNL_DTYPE_FLOAT; @@ -89,11 +99,12 @@ NarrowT CheckedNarrowing(const WideT& wide) { return narrow; } -static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { +inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { return ctx.template device_context().cnnl_handle(); } -static const MLUDeviceContext& GetDevCtxFromCTX(const ExecutionContext& ctx) { +inline static const MLUDeviceContext& GetDevCtxFromCTX( + const ExecutionContext& ctx) { return ctx.template device_context(); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc new file mode 100644 index 0000000000000..ef7e9940f0590 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_mlu.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" + +namespace paddle { +namespace operators { + +template +class ReduceMeanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->type())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->type())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class ReduceMeanGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + bool reduce_all = context.Attr("reduce_all"); + auto reduce_dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + + int reduce_numel = 1; + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < input_dims.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + input_dims.size(); + } + reduce_numel *= input_dims[d]; + } + + Tensor tmp_output_grad(output_grad->type()); + auto tmp_output_dims = input_dims; + for (auto d : reduce_dims) { + tmp_output_dims[d] = 1; + } + tmp_output_grad.ShareDataWith(*output_grad); + tmp_output_grad.Resize(framework::make_ddim(tmp_output_dims)); + + MLUCnnlTensorDesc output_grad_desc(tmp_output_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(tmp_output_grad.type())); + MLUCnnlTensorDesc input_grad_desc(*input_grad, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input_grad->type())); + + auto value = static_cast(1.0 / static_cast(reduce_numel)); + MLUCnnl::Fill(context, value, 
input_grad_desc.get(), + GetBasePtr(input_grad)); + + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(context, op_tensor_desc.get(), output_grad_desc.get(), + GetBasePtr(&tmp_output_grad), input_grad_desc.get(), + GetBasePtr(input_grad), input_grad_desc.get(), + GetBasePtr(input_grad), ToCnnlDataType()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_mean, ops::ReduceMeanMLUKernel, + ops::ReduceMeanMLUKernel); +REGISTER_OP_MLU_KERNEL(reduce_mean_grad, ops::ReduceMeanGradMLUKernel, + ops::ReduceMeanGradMLUKernel); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index b31b7456ebca7..1fe6686919453 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -232,6 +232,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); +#endif + } else if (platform::is_mlu_place(self.place())) { +#ifdef PADDLE_WITH_MLU + const T *a = self.data(); + auto p = BOOST_GET_CONST(platform::MLUPlace, self.place()); + paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), + nullptr); #endif } else if (platform::is_npu_place(self.place())) { #if defined(PADDLE_WITH_ASCEND_CL) @@ -267,6 +274,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); +#endif + } else if (platform::is_mlu_place(self->place())) { +#ifdef PADDLE_WITH_MLU + auto p = BOOST_GET_CONST(platform::MLUPlace, self->place()); + T *a = self->mutable_data(p); + paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), + nullptr); #endif } else if (platform::is_npu_place(self->place())) { #if defined(PADDLE_WITH_ASCEND_CL) @@ -543,6 +557,11 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, #ifdef PADDLE_WITH_XPU output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place), self.type()); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_MLU + output->mutable_data(BOOST_GET_CONST(platform::MLUPlace, place), + self.type()); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -845,8 +864,13 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, size_t copy_bytes = sizeof_dtype * numel; auto p = BOOST_GET_CONST(platform::MLUPlace, tensor.place()); - paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p, - tensor_buf_ptr, copy_bytes, nullptr); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, + copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); return py_arr; #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index b46a10c8c79d8..67697fcfd8398 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -803,6 +803,10 @@ if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() +if (WITH_MLU) + add_subdirectory(mlu) +endif() + add_subdirectory(asp) 
add_subdirectory(ir) diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt new file mode 100644 index 0000000000000..8fcd3f196dc19 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -0,0 +1,9 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +if (WITH_MLU) + foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + endforeach(TEST_OP) + +endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py new file mode 100644 index 0000000000000..36419327db6b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_mean_op_mlu.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 + + +class TestMean(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.device.MLUPlace(0) + self.op_type = "mean" + self.init_dtype() + + x = np.random.random([1, 100]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + np_out = np.mean(x) + self.outputs = {'Out': np_out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestMeanFP16(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.MLUPlace(0) + self.op_type = "mean" + self.init_dtype() + + x = np.random.random([3, 200]).astype(self.dtype) + self.inputs = {'X': x} + + self.attrs = {} + np_out = np.mean(x) + self.outputs = {'Out': np_out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py new file mode 100644 index 0000000000000..c0be644c79115 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_mean_op_mlu.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestMeanOp(OpTest): + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestMeanOp5D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 2, 5, 6, 10)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class TestMeanOp6D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float32") + } + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class TestMeanOp8D(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float32") + } + self.attrs = {'dim': (0, 3)} + self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} + + +class Test1DReduce(TestMeanOp): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random(120).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce0(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [0]} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + +class Test2DReduce1(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((20, 10)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [-2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class 
Test3DReduce3(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.attrs = {'dim': [1, 2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + +class TestKeepDimReduce(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestKeepDim8DReduce(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float32") + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].mean( + axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim']) + } + + +class TestReduceAll(Test1DReduce): + def setUp(self): + self.set_mlu() + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].mean()} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py new file mode 100644 index 0000000000000..25c50f67949e7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_relu_op_mlu.py @@ -0,0 +1,166 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
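+
+# Unit tests for the MLU relu kernel: TestRelu/TestReluFp16/TestReluNeg mark
+# the class with use_mlu and verify forward outputs on paddle.MLUPlace(0) via
+# check_output_with_place; TestReluNet runs a small static-graph network on
+# both MLU and CPU and compares predictions and loss with np.allclose.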
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestRelu(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(3, 2).astype(self.dtype) + out = x + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluFp16(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(3, 2).astype(self.dtype) + out = x + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestReluNeg(OpTest): + def setUp(self): + self.set_mlu() + self.op_type = "relu" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.array([0.1, -0.1, -1.0]).astype(self.dtype) + out = np.array([0.1, 0.0, 0.0]).astype(self.dtype) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluNet(unittest.TestCase): + def _test(self, run_mlu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.nn.functional.relu(sum) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_mlu: + place = paddle.MLUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_mlu(self): + cpu_pred, cpu_loss = self._test(False) + mlu_pred, mlu_loss = self._test(True) + + 
self.assertTrue(np.allclose(mlu_pred, cpu_pred)) + self.assertTrue(np.allclose(mlu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index ec59c27558332..01d851469a8d1 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -326,6 +326,9 @@ def is_rocm_op_test(): def is_npu_op_test(): return hasattr(cls, "use_npu") and cls.use_npu == True + def is_mlu_op_test(): + return hasattr(cls, "use_mlu") and cls.use_mlu == True + if not hasattr(cls, "op_type"): raise AssertionError( "This test do not have op_type in class attrs, " @@ -348,7 +351,8 @@ def is_npu_op_test(): and not is_xpu_op_test() \ and not is_mkldnn_op_test() \ and not is_rocm_op_test() \ - and not is_npu_op_test(): + and not is_npu_op_test() \ + and not is_mlu_op_test(): raise AssertionError( "This test of %s op needs check_grad with fp64 precision." % cls.op_type) @@ -1297,7 +1301,8 @@ def find_actual(target_name, fetch_list): # No effect on original OpTest # Currently not support ParallelExecutor on XPUPlace. if not paddle.is_compiled_with_xpu( - ) and not paddle.is_compiled_with_npu(): + ) and not paddle.is_compiled_with_npu( + ) and not paddle.is_compiled_with_mlu(): self.check_inplace_output_with_place( place, no_check_set=no_check_set, inplace_atol=inplace_atol) @@ -1547,11 +1552,9 @@ def check_grad_with_place(self, delta=numeric_grad_delta, in_place=in_place) for input_to_check in inputs_to_check ] - analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set, user_defined_grad_outputs) - # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 fp32_analytic_grads = [] From 556d509791b2b0a6c12781f7ecb6bbf811ee3bec Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 14 Jan 2022 11:47:16 +0800 Subject: [PATCH 19/24] refactor impl of elementwise op part2 (#38898) --- .../elementwise/elementwise_op_function.h | 621 +------------- paddle/pten/kernels/cpu/elementwise.h | 144 ++++ paddle/pten/kernels/gpu/elementwise.h | 768 ++++++++++++++++++ 3 files changed, 919 insertions(+), 614 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 626046890fb06..7cd04318d3f49 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -49,12 +49,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" -#define GetDivMod(dividend, divisor, div, mod) \ - do { \ - const auto dividend_copy = dividend; \ - *div = dividend_copy / divisor; \ - *mod = dividend_copy % divisor; \ - } while (0) #define DIVUP(x, y) (((x) + (y)-1) / (y)) @@ -138,613 +132,11 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, axis); } -template -void CommonForwardBroadcastCPU(const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z, - int *x_dims_array, int *y_dims_array, - int *out_dims_array, int max_dim, - const platform::CPUDeviceContext &ctx, - Functor func, - const bool is_xsize_larger = true) { - pten::CommonForwardBroadcastCPU(x, y, z, x_dims_array, y_dims_array, - out_dims_array, max_dim, ctx, func, - is_xsize_larger); -} - -#if defined(__NVCC__) || defined(__HIPCC__) - -template -__global__ void CommonGradBroadcastCUDAKernel( - const int *x_strides_array, const int *y_strides_array, - const int *out_dims_array, const int *y_strides_order, - const int *y_dims_order, const T *x, const T *y, const Tout *out, - const Tout *dout, T *dx, int out_size, int max_dim, int thread_num, - DX_OP dx_op) { - T val(0); - int i = blockIdx.x; - int tid = threadIdx.x; - for (int j = tid; j < thread_num; j += blockDim.x) { - const int X_index = i * thread_num + j; - int out_index = X_index; - int C_index = 0; - int B_index = i * thread_num + j; - int remainder = 0; -#pragma unroll - for (int d = max_dim - 1; d >= 0; --d) { - GetDivMod(B_index, y_dims_order[d], &B_index, &remainder); - C_index += remainder * y_strides_order[d]; - } - int x_index = 0; - int y_index = 0; - int C_index_val = C_index; -#pragma unroll - for (int d = max_dim - 1; d >= 0; --d) { - GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder); - x_index += remainder * x_strides_array[d]; - y_index += remainder * y_strides_array[d]; - } - out_index = C_index; - val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]); - } - val = paddle::platform::reduceSum(val, tid, thread_num); - if (threadIdx.x == 0) { - dx[i] = val; - } -} - -template -void CommonGradBroadcastCUDA( - const framework::Tensor &x, const framework::Tensor &y, - const framework::Tensor &out, const framework::Tensor &dout, - framework::Tensor *dx, framework::Tensor *dy, int *x_dims_array, - int *y_dims_array, int *out_dims_array, int max_dim, - const platform::CUDADeviceContext &ctx, DX_OP dx_op, DY_OP dy_op) { - const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - auto cplace = platform::CPUPlace(); - const T *x_data = x.data(); - const T *y_data = y.data(); - const Tout *out_data = out.data(); - const Tout *dout_data = dout.data(); - T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); - T *dy_data = dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace()); - - std::vector x_one_indexs; - std::vector y_one_indexs; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] != y_dims_array[i]) { - if (x_dims_array[i] == 1) { - x_one_indexs.push_back(i); - } - if (y_dims_array[i] == 1) { - y_one_indexs.push_back(i); - } - } - } - - std::vector x_trans_indexs(max_dim); - std::vector y_trans_indexs(max_dim); - pten::ComputeBroadcastTranspositionArray( - x_one_indexs.data(), x_trans_indexs.data(), max_dim, x_one_indexs.size()); - pten::ComputeBroadcastTranspositionArray( - y_one_indexs.data(), y_trans_indexs.data(), max_dim, y_one_indexs.size()); - - // compute array stride for cuda kernel; - // e.g. x.dims=[2,3,4], x_stride=[12,4,1] - std::vector x_strides_array(max_dim); - std::vector y_strides_array(max_dim); - std::vector out_strides_array(max_dim); - int x_stride = 1; - int y_stride = 1; - int z_stride = 1; - for (int i = max_dim - 1; i >= 0; i--) { - x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; - y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride; - out_strides_array[i] = z_stride; - x_stride *= x_dims_array[i]; - y_stride *= y_dims_array[i]; - z_stride *= out_dims_array[i]; - } - - std::vector x_strides_order(max_dim); - std::vector y_strides_order(max_dim); - std::vector x_dims_order(max_dim); - std::vector y_dims_order(max_dim); - for (int i = 0; i < max_dim; ++i) { - x_strides_order[i] = out_strides_array[x_trans_indexs[i]]; - y_strides_order[i] = out_strides_array[y_trans_indexs[i]]; - x_dims_order[i] = out_dims_array[x_trans_indexs[i]]; - y_dims_order[i] = out_dims_array[y_trans_indexs[i]]; - } - std::vector x_broadcast_pos; - std::vector y_broadcast_pos; - - int bytes = max_dim * sizeof(int); - - for (int i = 0; i < max_dim; ++i) { - if (x_dims_array[i] != out_dims_array[i] && x_dims_array[i] == 1) { - x_broadcast_pos.emplace_back(i); - } - if (y_dims_array[i] != out_dims_array[i] && y_dims_array[i] == 1) { - y_broadcast_pos.emplace_back(i); - } - } - - auto stream = ctx.stream(); - bool can_split_x = false; - bool can_split_y = false; - - auto FastCommonCUDAF = [&](const std::vector &broadcast_pos, bool is_y) { - int h = - std::accumulate(out_dims_array, out_dims_array + broadcast_pos.size(), - 1, std::multiplies()); - int w = - std::accumulate(out_dims_array + broadcast_pos.size(), - out_dims_array + max_dim, 1, std::multiplies()); - - VLOG(3) << "FastCommonCUDAF elementwise w:" << w << " h:" << h - << " is_y:" << is_y; - - int split_h; - int split_w; - int kh = h; - int kw = w; - - if (is_y) { - split_h = - std::accumulate(x_dims_array, x_dims_array + broadcast_pos.size(), 1, - std::multiplies()); - split_w = - std::accumulate(x_dims_array + broadcast_pos.size(), - x_dims_array + max_dim, 1, std::multiplies()); - - } else { - split_h = - std::accumulate(y_dims_array, y_dims_array + broadcast_pos.size(), 1, - std::multiplies()); - split_w = - std::accumulate(y_dims_array + broadcast_pos.size(), - y_dims_array + max_dim, 1, std::multiplies()); - } - - if (h > split_h) kh = split_h; - if (w > split_w) kw = split_w; - - if (is_y) { - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::CommonGradBroadcast1CUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, kw, - is_y); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastCommonGradBroadcastCUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dy_op, dy_data, kh, 
kw, - is_y); - } - } else { - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::CommonGradBroadcast1CUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw, - is_y); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastCommonGradBroadcastCUDAKernelHeight<<>>( - x_data, y_data, out_data, dout_data, h, w, dx_op, dx_data, kh, kw, - is_y); - } - } - }; - - auto FastBroadCastHeightCUDAF = [&](const std::vector &broadcast_pos, - bool x_large) { - int h = - std::accumulate(out_dims_array, out_dims_array + broadcast_pos.size(), - 1, std::multiplies()); - int w = - std::accumulate(out_dims_array + broadcast_pos.size(), - out_dims_array + max_dim, 1, std::multiplies()); - - VLOG(3) << "FastBroadCastHeightCUDAF w:" << w << " h:" << h; - - if (w < 16 || h < 16) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int grid_size = w; - pten::ElemwiseGradBroadcast1CUDAKernel<<>>( - x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op, - dx_data, dy_data); - } else { - dim3 block_size = dim3(BLOCK_X, BLOCK_Y); - int grid_size = (w + BLOCK_X - 1) / BLOCK_X; - pten::FastElemwiseGradBroadcast1CUDAKernel<<>>( - x_data, y_data, out_data, dout_data, h, w, x_large, dx_op, dy_op, - dx_data, dy_data); - } - }; - - auto FastBroadCastAllCUDAF = [&](const std::vector &broadcast_pos, - int max_dim, bool is_x_large) { - int axis = broadcast_pos[0]; - int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1, - std::multiplies()); - int mid = 1; - int post = 1; - - if (broadcast_pos.size() == 1) { - mid = out_dims_array[axis]; - post = - std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim, - 1, std::multiplies()); - } else { - mid = std::accumulate(out_dims_array + axis, - out_dims_array + broadcast_pos.back() + 1, 1, - std::multiplies()); - post = - std::accumulate(out_dims_array + broadcast_pos.back() + 1, - out_dims_array + max_dim, 1, std::multiplies()); - } - - VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid - << " post:" << post; - - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - - pten::FastCommonGradBroadcastAllCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, is_x_large, dx_op, - dy_op, dx_data, dy_data); - }; - - auto FastBroadCastOneCUDAF = [&](const std::vector &broadcast_pos, - int max_dim, bool is_x) { - int axis = broadcast_pos[0]; - int pre = std::accumulate(out_dims_array, out_dims_array + axis, 1, - std::multiplies()); - int mid = out_dims_array[axis]; - int post = - std::accumulate(out_dims_array + axis + 1, out_dims_array + max_dim, 1, - std::multiplies()); - - int k_pre; - int k_mid; - int k_post; - - if (is_x) { - k_pre = std::accumulate(y_dims_array, y_dims_array + axis, 1, - std::multiplies()); - k_mid = y_dims_array[axis]; - k_post = std::accumulate(y_dims_array + axis + 1, y_dims_array + max_dim, - 1, std::multiplies()); - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - // we need to calc y offset with blockid, so do x_pre/y_pre to get left - // size. 
- if (k_pre != pre) k_pre = pre / k_pre; - - pten::FastCommonGradBroadcastOneCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid, - k_post, true, dx_op, dx_data); - } else { - k_pre = std::accumulate(x_dims_array, x_dims_array + axis, 1, - std::multiplies()); - k_mid = x_dims_array[axis]; - k_post = std::accumulate(x_dims_array + axis + 1, x_dims_array + max_dim, - 1, std::multiplies()); - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); - int grid_size = pre * post; - if (k_pre != pre) k_pre = pre / k_pre; - - pten::FastCommonGradBroadcastOneCUDAKernel<<>>( - x_data, y_data, out_data, dout_data, pre, mid, post, k_pre, k_mid, - k_post, false, dy_op, dy_data); - } - VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid - << " post:" << post; - }; - - // do fast elementwise if: 1. only one input need to do broadcast, we can - // fallback - // to old fast path. - // 2. if both x and y need broadcast, then do it one by one. - bool fast_broadcast = false; - if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { - can_split_y = pten::SplitDims(y_broadcast_pos, max_dim); - if (can_split_y) { - // only y need to do broadcast on h - if (y_broadcast_pos[0] == 0) { - FastBroadCastHeightCUDAF(y_broadcast_pos, true); - fast_broadcast = true; - } - } else if (y_broadcast_pos.size() == 1 || - pten::CheckContiguousDims( - y_broadcast_pos)) { // for only one dim and - // contiguous broadcast. - // If cannot split, which means input has 3 parts - FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true); - fast_broadcast = true; - } - } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) { - // only x need broadcast - can_split_x = pten::SplitDims(x_broadcast_pos, max_dim); - if (can_split_x) { - if (x_broadcast_pos[0] == 0) { - FastBroadCastHeightCUDAF(x_broadcast_pos, false); - fast_broadcast = true; - } - } else if (x_broadcast_pos.size() == 1 || - pten::CheckContiguousDims(x_broadcast_pos)) { - FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false); - fast_broadcast = true; - } - } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { - // do x and y broadcast each. - can_split_y = pten::SplitDims(y_broadcast_pos, max_dim); - bool fast_broadcast_x = false; - bool fast_broadcast_y = false; - if (can_split_y) { - // begin at start. - if (y_broadcast_pos[0] == 0) { - FastCommonCUDAF(y_broadcast_pos, true); - fast_broadcast_y = true; - } - } else if (y_broadcast_pos.size() == 1) { - FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false); - can_split_y = true; - fast_broadcast_y = true; - } - can_split_x = pten::SplitDims(x_broadcast_pos, max_dim); - if (can_split_x) { - if (x_broadcast_pos[0] == 0) { - FastCommonCUDAF(x_broadcast_pos, false); - fast_broadcast_x = true; - } - } else if (x_broadcast_pos.size() == 1) { - FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true); - can_split_x = true; - fast_broadcast_x = true; - } - VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y - << " can_split_x:" << can_split_x; - // if both x and y into fast path then return - if (fast_broadcast_x && fast_broadcast_y) { - fast_broadcast = true; - } - if (can_split_y && can_split_x && fast_broadcast) return; - } - - // Should remove memory copy, use reg instead. 
- if (fast_broadcast) { - return; - } - int x_blocks = 0; - int x_threads = 0; - pten::ComputeBroadcastKernelSize(x_dims_array, out_dims_array, &x_blocks, - &x_threads, max_dim); - int y_blocks = 0; - int y_threads = 0; - pten::ComputeBroadcastKernelSize(y_dims_array, out_dims_array, &y_blocks, - &y_threads, max_dim); - - auto x_strides_array_tmp = memory::Alloc(ctx, bytes); - int *x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); - memory::Copy(gplace, x_strides_array_gpu, cplace, x_strides_array.data(), - bytes, ctx.stream()); - - auto y_strides_array_tmp = memory::Alloc(ctx, bytes); - int *y_strides_array_gpu = - reinterpret_cast(y_strides_array_tmp->ptr()); - memory::Copy(gplace, y_strides_array_gpu, cplace, y_strides_array.data(), - bytes, ctx.stream()); - - auto out_dims_array_tmp = memory::Alloc(ctx, bytes); - int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); - memory::Copy(gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, - ctx.stream()); - - const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, - 1, std::multiplies()); - int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); - int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); - if (dx) { - auto x_strides_order_tmp = memory::Alloc(ctx, bytes); - int *x_strides_order_gpu = - reinterpret_cast(x_strides_order_tmp->ptr()); - memory::Copy(gplace, x_strides_order_gpu, cplace, x_strides_order.data(), - bytes, ctx.stream()); - - auto x_dims_order_tmp = memory::Alloc(ctx, bytes); - int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); - memory::Copy(gplace, x_dims_order_gpu, cplace, x_dims_order.data(), bytes, - ctx.stream()); - CommonGradBroadcastCUDAKernel< - T, DX_OP, Tout><<>>( - x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, - x_strides_order_gpu, x_dims_order_gpu, x_data, y_data, out_data, - dout_data, dx_data, out_size, max_dim, x_threads, dx_op); - } - if (dy) { - auto y_strides_order_tmp = memory::Alloc(ctx, bytes); - int *y_strides_order_gpu = - reinterpret_cast(y_strides_order_tmp->ptr()); - memory::Copy(gplace, y_strides_order_gpu, cplace, y_strides_order.data(), - bytes, ctx.stream()); - - auto y_dims_order_tmp = memory::Alloc(ctx, bytes); - int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); - memory::Copy(gplace, y_dims_order_gpu, cplace, y_dims_order.data(), bytes, - ctx.stream()); - CommonGradBroadcastCUDAKernel< - T, DY_OP, Tout><<>>( - x_strides_array_gpu, y_strides_array_gpu, out_dims_array_gpu, - y_strides_order_gpu, y_dims_order_gpu, x_data, y_data, out_data, - dout_data, dy_data, out_size, max_dim, y_threads, dy_op); - } -} - -#endif // __NVCC__ or __HIPCC__ - inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { return pten::funcs::trim_trailing_singular_dims(dims); } -template -void CommonElementwiseBroadcastBackward( - const framework::ExecutionContext &ctx, const framework::DDim &x_dims, - const framework::DDim &y_dims, const framework::Tensor &x, - const framework::Tensor &y, const framework::Tensor &out, - const framework::Tensor &dout, int axis, framework::Tensor *dx, - framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - int max_dim = std::max(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), max_dim, - axis); - // for inplace strategy. memset will make dx and dout clear and get wrong - // result. - if (dx && dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); - } - - VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << framework::make_ddim(x_dims_array) - << " ydim:" << framework::make_ddim(y_dims_array); - - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - CommonGradBroadcastCUDA( - x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), - out_dims_array.data(), max_dim, - ctx.template device_context(), dx_op, - dy_op); -#endif - } else { - pten::CommonGradBroadcastCPU( - x, y, out, dout, dx, dy, x_dims_array.data(), y_dims_array.data(), - out_dims_array.data(), max_dim, - ctx.template device_context(), dx_op, - dy_op); - } -} - -template -void ElemwiseGradComputeWithBroadcast( - const framework::ExecutionContext &ctx, const framework::DDim &x_dims, - const framework::DDim &y_dims, const framework::Tensor &x, - const framework::Tensor &y, const framework::Tensor &out, - const framework::Tensor &dout, int axis, framework::Tensor *dx, - framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - bool is_xsize_larger = true; - - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, 0, - platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, max_dim, - platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, - &is_run_common_broadcast); - } - // special case for common backward implementation. - if (is_run_common_broadcast) { - CommonElementwiseBroadcastBackward( - ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - return; - } - if (post == 1) { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - pten::ElemwiseGradBroadcast1CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - pten::ElemwiseGradBroadcast1CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); - } - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - pten::ElemwiseGradBroadcast2CUDA( - ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), pre, n, post, - is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); -#endif - } else { - pten::ElemwiseGradBroadcast2CPU( - x.data(), y.data(), out.data(), dout.data(), pre, n, - post, is_xsize_larger, dx_op, dy_op, - dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), - dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); - } - } -} - -template -void CommonElementwiseBroadcastForward( - const framework::ExecutionContext &ctx, const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z, - const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func, - int axis, const bool is_xsize_larger = true) { - z->mutable_data(ctx.GetPlace()); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - const auto &dev_ctx = ctx.template device_context(); - pten::CommonElementwiseBroadcastForward(dev_ctx, *pt_x.get(), *pt_y.get(), - pt_z.get(), x_dims, y_dims, func, - axis, is_xsize_larger); -} - template void ElemwiseGradCompute(const framework::ExecutionContext &ctx, @@ -755,14 +147,14 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, DX_OP dx_op, DY_OP dy_op) { const framework::DDim &x_dim = x.dims(); const framework::DDim &y_dim = y.dims(); + const auto &dev_ctx = ctx.template device_context(); if (x.dims() == y.dims()) { - const auto &dev_ctx = ctx.template device_context(); pten::funcs::ElemwiseGradComputeNoBroadcast( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { - ElemwiseGradComputeWithBroadcast( - ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + pten::ElemwiseGradComputeWithBroadcast( + dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } @@ -780,14 +172,15 @@ void ElemwiseExplicitGradCompute(const framework::ExecutionContext &ctx, DX_OP dx_op, DY_OP dy_op) { const framework::DDim &x_dim = x.dims(); const framework::DDim &y_dim = y.dims(); + const auto &dev_ctx = ctx.template device_context(); if (x.dims() == y.dims()) { - const auto &dev_ctx = ctx.template device_context(); pten::funcs::ElemwiseGradComputeNoBroadcast( dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); } else { - ElemwiseGradComputeWithBroadcast( - ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, dy_op); + pten::ElemwiseGradComputeWithBroadcast( + dev_ctx, x_dim, y_dim, dout, dout, out, dout, axis, dx, dy, dx_op, + dy_op); } } diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index 97db997a16478..b448586754d60 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -549,4 +549,148 @@ static void ElemwiseGradBroadcast2CPU(const T* x, } } +template +void CommonElementwiseBroadcastBackward(const CPUContext& ctx, + const DDim& x_dims, + const DDim& y_dims, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), 
y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << paddle::framework::make_ddim(x_dims_array) + << " ydim:" << paddle::framework::make_ddim(y_dims_array); + + CommonGradBroadcastCPU(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, + const DDim& x_dims, + const DDim& y_dims, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CPU( + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } else { + ElemwiseGradBroadcast2CPU( + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } +} + } // namespace pten diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index 4dfcd7a2152e0..5abc40c75d17f 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -18,7 +18,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/cuda_kernel_config.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; @@ -28,6 +31,13 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #define BLOCK_X 32 #define BLOCK_Y 32 +#define GetDivMod(dividend, divisor, div, mod) \ + do { \ + const auto dividend_copy = dividend; \ + *div = dividend_copy / divisor; \ + *mod = dividend_copy % divisor; \ + } while (0) + namespace pten { namespace kps = paddle::operators::kernel_primitives; @@ -1469,4 +1479,762 @@ static void ElemwiseGradBroadcast2CUDA(gpuStream_t stream, x, y, out, dout, pre, n, post, is_xsize_larger, dx_op, dy_op, dx, dy); } +template +__global__ void CommonGradBroadcastCUDAKernel(const int *x_strides_array, + const int *y_strides_array, + const int *out_dims_array, + const int *y_strides_order, + const int *y_dims_order, + const T *x, + const T *y, + const Tout *out, + const Tout *dout, + T *dx, + int out_size, + int max_dim, + int thread_num, + DX_OP dx_op) { + T val(0); + int i = blockIdx.x; + int tid = threadIdx.x; + for (int j = tid; j < thread_num; j += blockDim.x) { + const int X_index = i * thread_num + j; + int out_index = X_index; + int C_index = 0; + int B_index = i * thread_num + j; + int remainder = 0; +#pragma unroll + for (int d = max_dim - 1; d >= 0; --d) { + GetDivMod(B_index, y_dims_order[d], &B_index, &remainder); + C_index += remainder * y_strides_order[d]; + } + int x_index = 0; + int y_index = 0; + int C_index_val = C_index; +#pragma unroll + for (int d = max_dim - 1; d >= 0; --d) { + GetDivMod(C_index_val, out_dims_array[d], &C_index_val, &remainder); + x_index += remainder * x_strides_array[d]; + y_index += remainder * y_strides_array[d]; + } + out_index = C_index; + val += dx_op(x[x_index], y[y_index], out[out_index], dout[out_index]); + } + val = paddle::platform::reduceSum(val, tid, thread_num); + if (threadIdx.x == 0) { + dx[i] = val; + } +} + +template +void CommonGradBroadcastCUDA(const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const GPUContext &ctx, + DX_OP dx_op, + DY_OP dy_op) { + const auto gplace = + BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx.GetPlace()); + auto cplace = paddle::platform::CPUPlace(); + const T *x_data = x.data(); + const T *y_data = y.data(); + const Tout *out_data = out.data(); + const Tout *dout_data = dout.data(); + T *dx_data = dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()); + T *dy_data = dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace()); + + std::vector x_one_indexs; + std::vector y_one_indexs; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] != y_dims_array[i]) { + if (x_dims_array[i] == 1) { + x_one_indexs.push_back(i); + } + if (y_dims_array[i] == 1) { + y_one_indexs.push_back(i); + } + } + } + + std::vector x_trans_indexs(max_dim); + std::vector y_trans_indexs(max_dim); + ComputeBroadcastTranspositionArray( + x_one_indexs.data(), x_trans_indexs.data(), max_dim, x_one_indexs.size()); + ComputeBroadcastTranspositionArray( + y_one_indexs.data(), y_trans_indexs.data(), max_dim, y_one_indexs.size()); + + // compute array stride for cuda kernel; + // e.g. x.dims=[2,3,4], x_stride=[12,4,1] + std::vector x_strides_array(max_dim); + std::vector y_strides_array(max_dim); + std::vector out_strides_array(max_dim); + int x_stride = 1; + int y_stride = 1; + int z_stride = 1; + for (int i = max_dim - 1; i >= 0; i--) { + x_strides_array[i] = x_dims_array[i] == 1 ? 0 : x_stride; + y_strides_array[i] = y_dims_array[i] == 1 ? 0 : y_stride; + out_strides_array[i] = z_stride; + x_stride *= x_dims_array[i]; + y_stride *= y_dims_array[i]; + z_stride *= out_dims_array[i]; + } + + std::vector x_strides_order(max_dim); + std::vector y_strides_order(max_dim); + std::vector x_dims_order(max_dim); + std::vector y_dims_order(max_dim); + for (int i = 0; i < max_dim; ++i) { + x_strides_order[i] = out_strides_array[x_trans_indexs[i]]; + y_strides_order[i] = out_strides_array[y_trans_indexs[i]]; + x_dims_order[i] = out_dims_array[x_trans_indexs[i]]; + y_dims_order[i] = out_dims_array[y_trans_indexs[i]]; + } + std::vector x_broadcast_pos; + std::vector y_broadcast_pos; + + int bytes = max_dim * sizeof(int); + + for (int i = 0; i < max_dim; ++i) { + if (x_dims_array[i] != out_dims_array[i] && x_dims_array[i] == 1) { + x_broadcast_pos.emplace_back(i); + } + if (y_dims_array[i] != out_dims_array[i] && y_dims_array[i] == 1) { + y_broadcast_pos.emplace_back(i); + } + } + + auto stream = ctx.stream(); + bool can_split_x = false; + bool can_split_y = false; + + auto FastCommonCUDAF = [&](const std::vector &broadcast_pos, bool is_y) { + int h = std::accumulate(out_dims_array, + out_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + int w = std::accumulate(out_dims_array + broadcast_pos.size(), + out_dims_array + max_dim, + 1, + std::multiplies()); + + VLOG(3) << "FastCommonCUDAF elementwise w:" << w << " h:" << h + << " is_y:" << is_y; + + int split_h; + int split_w; + int kh = h; + int kw = w; + + if (is_y) { + split_h = std::accumulate(x_dims_array, + x_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + split_w = std::accumulate(x_dims_array + broadcast_pos.size(), + x_dims_array + max_dim, + 1, + std::multiplies()); + + } else { + split_h = std::accumulate(y_dims_array, + y_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + split_w = std::accumulate(y_dims_array + broadcast_pos.size(), + y_dims_array + max_dim, + 1, + std::multiplies()); + } + + if (h > split_h) kh = split_h; + if (w > split_w) kw = split_w; + + if (is_y) { + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + CommonGradBroadcast1CUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dy_op, + dy_data, + kh, + kw, + is_y); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastCommonGradBroadcastCUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dy_op, 
+ dy_data, + kh, + kw, + is_y); + } + } else { + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + CommonGradBroadcast1CUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dx_op, + dx_data, + kh, + kw, + is_y); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastCommonGradBroadcastCUDAKernelHeight<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + dx_op, + dx_data, + kh, + kw, + is_y); + } + } + }; + + auto FastBroadCastHeightCUDAF = [&](const std::vector &broadcast_pos, + bool x_large) { + int h = std::accumulate(out_dims_array, + out_dims_array + broadcast_pos.size(), + 1, + std::multiplies()); + int w = std::accumulate(out_dims_array + broadcast_pos.size(), + out_dims_array + max_dim, + 1, + std::multiplies()); + + VLOG(3) << "FastBroadCastHeightCUDAF w:" << w << " h:" << h; + + if (w < 16 || h < 16) { + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); + int grid_size = w; + ElemwiseGradBroadcast1CUDAKernel<<>>( + x_data, + y_data, + out_data, + dout_data, + h, + w, + x_large, + dx_op, + dy_op, + dx_data, + dy_data); + } else { + dim3 block_size = dim3(BLOCK_X, BLOCK_Y); + int grid_size = (w + BLOCK_X - 1) / BLOCK_X; + FastElemwiseGradBroadcast1CUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + h, + w, + x_large, + dx_op, + dy_op, + dx_data, + dy_data); + } + }; + + auto FastBroadCastAllCUDAF = [&]( + const std::vector &broadcast_pos, int max_dim, bool is_x_large) { + int axis = broadcast_pos[0]; + int pre = std::accumulate( + out_dims_array, out_dims_array + axis, 1, std::multiplies()); + int mid = 1; + int post = 1; + + if (broadcast_pos.size() == 1) { + mid = out_dims_array[axis]; + post = std::accumulate(out_dims_array + axis + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + } else { + mid = std::accumulate(out_dims_array + axis, + out_dims_array + broadcast_pos.back() + 1, + 1, + std::multiplies()); + post = std::accumulate(out_dims_array + broadcast_pos.back() + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + } + + VLOG(3) << "FastBroadCastAllCUDAF pre:" << pre << " mid:" << mid + << " post:" << post; + + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + + FastCommonGradBroadcastAllCUDAKernel<<>>( + x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + is_x_large, + dx_op, + dy_op, + dx_data, + dy_data); + }; + + auto FastBroadCastOneCUDAF = [&]( + const std::vector &broadcast_pos, int max_dim, bool is_x) { + int axis = broadcast_pos[0]; + int pre = std::accumulate( + out_dims_array, out_dims_array + axis, 1, std::multiplies()); + int mid = out_dims_array[axis]; + int post = std::accumulate(out_dims_array + axis + 1, + out_dims_array + max_dim, + 1, + std::multiplies()); + + int k_pre; + int k_mid; + int k_post; + + if (is_x) { + k_pre = std::accumulate( + y_dims_array, y_dims_array + axis, 1, std::multiplies()); + k_mid = y_dims_array[axis]; + k_post = std::accumulate(y_dims_array + axis + 1, + y_dims_array + max_dim, + 1, + std::multiplies()); + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + // we need to calc y offset with blockid, so do x_pre/y_pre to get left + // size. 
+ if (k_pre != pre) k_pre = pre / k_pre; + + FastCommonGradBroadcastOneCUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + k_pre, + k_mid, + k_post, + true, + dx_op, + dx_data); + } else { + k_pre = std::accumulate( + x_dims_array, x_dims_array + axis, 1, std::multiplies()); + k_mid = x_dims_array[axis]; + k_post = std::accumulate(x_dims_array + axis + 1, + x_dims_array + max_dim, + 1, + std::multiplies()); + int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, mid); + int grid_size = pre * post; + if (k_pre != pre) k_pre = pre / k_pre; + + FastCommonGradBroadcastOneCUDAKernel<<>>(x_data, + y_data, + out_data, + dout_data, + pre, + mid, + post, + k_pre, + k_mid, + k_post, + false, + dy_op, + dy_data); + } + VLOG(3) << "FastBroadCastOneCUDAF pre:" << pre << " mid:" << mid + << " post:" << post; + }; + + // do fast elementwise if: 1. only one input need to do broadcast, we can + // fallback + // to old fast path. + // 2. if both x and y need broadcast, then do it one by one. + bool fast_broadcast = false; + if (x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { + can_split_y = SplitDims(y_broadcast_pos, max_dim); + if (can_split_y) { + // only y need to do broadcast on h + if (y_broadcast_pos[0] == 0) { + FastBroadCastHeightCUDAF(y_broadcast_pos, true); + fast_broadcast = true; + } + } else if (y_broadcast_pos.size() == 1 || + CheckContiguousDims(y_broadcast_pos)) { // for only one dim and + // contiguous broadcast. + // If cannot split, which means input has 3 parts + FastBroadCastAllCUDAF(y_broadcast_pos, max_dim, true); + fast_broadcast = true; + } + } else if (y_broadcast_pos.empty() && !x_broadcast_pos.empty()) { + // only x need broadcast + can_split_x = SplitDims(x_broadcast_pos, max_dim); + if (can_split_x) { + if (x_broadcast_pos[0] == 0) { + FastBroadCastHeightCUDAF(x_broadcast_pos, false); + fast_broadcast = true; + } + } else if (x_broadcast_pos.size() == 1 || + CheckContiguousDims(x_broadcast_pos)) { + FastBroadCastAllCUDAF(x_broadcast_pos, max_dim, false); + fast_broadcast = true; + } + } else if (!x_broadcast_pos.empty() && !y_broadcast_pos.empty()) { + // do x and y broadcast each. + can_split_y = SplitDims(y_broadcast_pos, max_dim); + bool fast_broadcast_x = false; + bool fast_broadcast_y = false; + if (can_split_y) { + // begin at start. + if (y_broadcast_pos[0] == 0) { + FastCommonCUDAF(y_broadcast_pos, true); + fast_broadcast_y = true; + } + } else if (y_broadcast_pos.size() == 1) { + FastBroadCastOneCUDAF(y_broadcast_pos, max_dim, false); + can_split_y = true; + fast_broadcast_y = true; + } + can_split_x = SplitDims(x_broadcast_pos, max_dim); + if (can_split_x) { + if (x_broadcast_pos[0] == 0) { + FastCommonCUDAF(x_broadcast_pos, false); + fast_broadcast_x = true; + } + } else if (x_broadcast_pos.size() == 1) { + FastBroadCastOneCUDAF(x_broadcast_pos, max_dim, true); + can_split_x = true; + fast_broadcast_x = true; + } + VLOG(3) << "CommonBroadcast can_split_y:" << can_split_y + << " can_split_x:" << can_split_x; + // if both x and y into fast path then return + if (fast_broadcast_x && fast_broadcast_y) { + fast_broadcast = true; + } + if (can_split_y && can_split_x && fast_broadcast) return; + } + + // Should remove memory copy, use reg instead. 
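+  // Only the generic stride-based kernels below run when no fast path above handled the broadcast.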
+ if (fast_broadcast) { + return; + } + int x_blocks = 0; + int x_threads = 0; + ComputeBroadcastKernelSize( + x_dims_array, out_dims_array, &x_blocks, &x_threads, max_dim); + int y_blocks = 0; + int y_threads = 0; + ComputeBroadcastKernelSize( + y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim); + + auto x_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_strides_array_gpu = + reinterpret_cast(x_strides_array_tmp->ptr()); + paddle::memory::Copy(gplace, + x_strides_array_gpu, + cplace, + x_strides_array.data(), + bytes, + ctx.stream()); + + auto y_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_strides_array_gpu = + reinterpret_cast(y_strides_array_tmp->ptr()); + paddle::memory::Copy(gplace, + y_strides_array_gpu, + cplace, + y_strides_array.data(), + bytes, + ctx.stream()); + + auto out_dims_array_tmp = paddle::memory::Alloc(ctx, bytes); + int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); + paddle::memory::Copy( + gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream()); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); + int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); + if (dx) { + auto x_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_strides_order_gpu = + reinterpret_cast(x_strides_order_tmp->ptr()); + paddle::memory::Copy(gplace, + x_strides_order_gpu, + cplace, + x_strides_order.data(), + bytes, + ctx.stream()); + + auto x_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); + paddle::memory::Copy(gplace, + x_dims_order_gpu, + cplace, + x_dims_order.data(), + bytes, + ctx.stream()); + CommonGradBroadcastCUDAKernel< + T, + DX_OP, + Tout><<>>(x_strides_array_gpu, + y_strides_array_gpu, + out_dims_array_gpu, + x_strides_order_gpu, + x_dims_order_gpu, + x_data, + y_data, + out_data, + dout_data, + dx_data, + out_size, + max_dim, + x_threads, + dx_op); + } + if (dy) { + auto y_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_strides_order_gpu = + reinterpret_cast(y_strides_order_tmp->ptr()); + paddle::memory::Copy(gplace, + y_strides_order_gpu, + cplace, + y_strides_order.data(), + bytes, + ctx.stream()); + + auto y_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); + paddle::memory::Copy(gplace, + y_dims_order_gpu, + cplace, + y_dims_order.data(), + bytes, + ctx.stream()); + CommonGradBroadcastCUDAKernel< + T, + DY_OP, + Tout><<>>(x_strides_array_gpu, + y_strides_array_gpu, + out_dims_array_gpu, + y_strides_order_gpu, + y_dims_order_gpu, + x_data, + y_data, + out_data, + dout_data, + dy_data, + out_size, + max_dim, + y_threads, + dy_op); + } +} + +template +void CommonElementwiseBroadcastBackward(const GPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << paddle::framework::make_ddim(x_dims_array) + << " ydim:" << paddle::framework::make_ddim(y_dims_array); + + CommonGradBroadcastCUDA(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CUDA( + ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); + } else { + ElemwiseGradBroadcast2CUDA( + ctx.stream(), + x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); + } +} + } // namespace pten From 4c77a9086c488a9a0b11d4e7f0c406c31716345e Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Fri, 14 Jan 2022 15:38:49 +0800 Subject: [PATCH 20/24] Add dygraph sharding stage3 (#38052) --- paddle/pten/core/dense_tensor.cc | 4 + .../meta_parallel/sharding/sharding_stage3.py | 675 ++++++++++++++++++ .../meta_parallel/sharding/sharding_utils.py | 31 +- .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/dygraph_sharding_stage3.py | 233 ++++++ .../unittests/test_dygraph_sharding_stage3.py | 31 + 6 files changed, 960 insertions(+), 17 deletions(-) create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 0b5f5cb18e13d..eb6f834d72779 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -435,6 +435,10 @@ inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, } void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { + if (storage_ == nullptr) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + } if (storage_ != nullptr && tensor.storage_ != nullptr) { storage_->set_data_shared(tensor.storage_->data_shared()); } diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py new file mode 100644 index 0000000000000..e5d04aac1551e --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -0,0 +1,675 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +import time +import contextlib +import logging +import functools +import numpy as np +from itertools import chain +from functools import reduce +from types import MethodType +from collections import deque, OrderedDict + +import paddle +from paddle import nn +from paddle.autograd import PyLayer +import paddle.fluid.core as core +import paddle.distributed as dist +from paddle.fluid.framework import ParamBase +from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.distributed.collective import _get_global_group + +from .sharding_utils import Type, ShardingClipGrad +from ..pp_utils.utils import _all_gather + +# CUDA alignment 256 bytes +alignment = {"gpu": 256, } +align = { + Type.fp16.value: 2, + Type.fp32.value: 4, +} + +global CHECK_LAYER +CHECK_LAYER = dict() # Help to check layer's id -> layer's name + + +class ShardingStage3(nn.Layer): + """ + A wrapper for Sharding Stage3 Layer in Dygraph. + + .. warning: ShardingStage3 encapsulates the layer strategy and integrates it into the nn.Layer. + + .. 
ZeRO: https://arxiv.org/pdf/1910.02054.pdf. + """ + + def __init__(self, + layer, + optimizer, + group=None, + sync_buffers=False, + device="gpu", + pertrain_sync_models=True, + accumulate_grads=False, + offload=False, + sync_comm=False): + super().__init__() + + # Default configs + assert core.is_compiled_with_cuda(), "Only support CUDA." + self._layer = layer + self._default_device = device + self.__sync_buffers = sync_buffers + self._accumulate_grads = accumulate_grads + self._offload = offload + self._sync_comm = sync_comm + + # Communication group establishment + self._group = dist.new_group(_get_global_group() + .ranks) if group is None else group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1." + self._rank = self._group.rank + self._global_root_rank = 0 # picking rank 0 as the reference + self._global_ranks = self._group.ranks + self._param2buffer_size = dict() # {param.name: size} + self._param2buffer = dict( + ) # {param.name: [(start0, end0),(start1, end1), ...]} + self._trainable_params = dict() # {layer.name: [trainable_params]} + + assert not isinstance( + optimizer, list), "Multiple optimizers are not supported now." + self._optim = _OptimizerWrapper(optimizer, self._offload, self._group, + self._update_params_slice) + self._ori_parameter_list = self._optim._parameter_list + self._ori_param_groups = self._optim._param_groups + + # Replace optimizer's _grad_clip + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in ShardingStage3, the grad clip of original optimizer will be changed." + ) + self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, + paddle.get_device(), + self._group) + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + self._segment_rank_params(self._layer) + + # In the first step, record the execution order of the layer + self._order_tracer = OrderedDict() + self._order_tracer["order"] = 0 + self._order_tracer["layer"] = [] + # Register task flow + self._task_flow = TaskFlow() + # Register forward hooks + self._register_forward_hooks(self._layer) + # Register backward parameter hooks + self._register_backward_hooks() + # Redefine optimizer step and clear function + self._redefine_opt_step() + self._redefine_opt_clear() + + @paddle.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._layer.parameters(): + dist.broadcast( + p, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + # Multi stream operation will be supported later + dist.wait(tensor=p, group=self._group, use_calc_stream=True) + + def _clear_gradients(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + assert hasattr( + param, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format(param.name) + + # param.bw_storage.zero_() + param.fw_storage.clear_gradient(False) + param.fw_storage._gradient_set_empty(False) + param.bw_storage._clear() + + # Update param memery slice + def _update_params_slice(self): + update_list = self._update_params() + + if not isinstance(self._optim._param_groups[0], dict): + slice_params = [param.fw_storage for param in update_list] + self._optim._parameter_list = 
slice_params + self._optim._param_groups = slice_params + else: + params_name_list = list(map(lambda p: p.name, update_list)) + for param_group in self._optim._param_groups: + slice_p = [] + for p in param_group['params']: + if p.name in params_name_list: + assert hasattr( + p, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format( + p.name) + slice_p.append(p.fw_storage) + param_group['params'] = slice_p + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage3 layer. + """ + # 1.Sync layer's buffers state + if self.__sync_buffers: + self._sync_buffers() + + # 2.Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def _segment_rank_params(self, layer, name="last_layer"): + current_layer_params = _current_layer_params(layer) + if current_layer_params: + CHECK_LAYER[id(layer)] = name + self._flatten_layer_params(layer, current_layer_params) + + for name, sub_layer in layer.named_children(): + self._segment_rank_params(sub_layer, name) + + def _flatten_layer_params(self, layer, current_layer_params): + def _add_manage_info(trainable_param): + return _PartitionParam(trainable_param) + + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + assert id(layer) not in self._trainable_params.keys() + self._trainable_params[id(layer)] = list( + map(_add_manage_info, trainable_params)) + + for param in self._trainable_params[id(layer)]: + if param.name in self._param2buffer.keys(): + continue + self._param2buffer[param.name] = [] + # 1.Params alignment + offset = 0 + # CUDA alignment 256 bytes + size = param._numel() * align[param.dtype] + remaining = size % alignment[self._default_device] + ali = 0 if remaining == 0 else alignment[ + self._default_device] - remaining + align_ = ali // align[param.dtype] + + offset = align_ + param._numel() + buffer_size = offset if offset % self._group.nranks == 0 else offset + self._group.nranks - ( + offset % self._group.nranks) + self._param2buffer_size[param.name] = buffer_size + + # 2.Combination param buffer + assert buffer_size % self._group.nranks == 0 + pre_buffer = buffer_size // self._group.nranks + + for rank_ in range(self._group.nranks): + self._param2buffer[param.name].append( + (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) + + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + + def _param_storage(self, param, buffer_size): + assert isinstance(buffer_size, int) + value = np.zeros( + buffer_size, + dtype=np.float16) if Type.fp16.value == param.dtype else np.zeros( + buffer_size, dtype=np.float32) + buffer = core.VarBase(value=value, place=core.CPUPlace()) + + param_shape = param.shape + origin_state = param.stop_gradient + param.stop_gradient = True + param.flatten_() + param.stop_gradient = origin_state + start, end = self._param2buffer[param.name][self._rank] + + # Copy the current param value + tmp_var = core.VarBase( + tensor=buffer._slice(0, param._numel()), place=core.CPUPlace()) + param_cpu = param.cpu() + tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(), + core.CPUPlace()) + param.value().get_tensor()._set_dims(param_shape) + param._clear() + + # Current rank param_storage + param.fw_storage = core.VarBase( + buffer._slice(start, end), "slice@" + param.name) + param.status = "part" + + # Updata optimizer master weights + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.fw_storage.name] = paddle.cast( + param.fw_storage, Type.fp32.value) + + def 
_register_forward_hooks(self, layer): + current_layer_params = _current_layer_params(layer) + if current_layer_params: + self._register_forward_all_hooks(layer, self._task_flow) + + for _, sub_layer in layer.named_children(): + self._register_forward_hooks(sub_layer) + + def _register_forward_all_hooks(self, sub_layer, task_flow): + def _forward_pre_hook(layer, inputs): + return ForwardPreHooks(layer, self._order_tracer, + self._trainable_params, self._param2buffer, + self._rank, self._group, self._sync_comm, + task_flow) + + def _forward_post_hook(layer, inputs, outputs): + return ForwardPostHooks.apply( + outputs, layer, self._order_tracer, self._trainable_params, + self._param2buffer, self._param2buffer_size, self._rank, + self._group, self._sync_comm, task_flow) + + # register previous forward hooks + sub_layer.register_forward_pre_hook(_forward_pre_hook) + + # register post forward hooks + sub_layer.register_forward_post_hook(_forward_post_hook) + + @paddle.no_grad() + def _sync_buffers(self): + for buffer in self._layer.buffers(include_sublayers=True): + dist.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + # Multi stream operation will be supported later + dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + def _update_params(self): + update_list = [] + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + + if self._accumulate_grads: + param.bw_storage.scale_(scale=self._world_size_scaling) + param.fw_storage = _VarBaseWrapper(param) + param.fw_storage._copy_gradient_from(param.bw_storage) + update_list.append(param) + return update_list + + def get_all_parameters(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + if param.use_count > 0: + continue + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + + full_param = _all_gather( + param.fw_storage, self._group, use_calc_stream=True) + dist.wait( + tensor=full_param, group=self._group, use_calc_stream=True) + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + + self._optim._parameter_list = self._ori_parameter_list + self._optim._param_groups = self._ori_param_groups + + def _register_backward_hooks(self): + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + + for param in trainable_params: + allreduce_function = self._get_allreduce_fn(param) + param._register_backward_hook(allreduce_function) + + def _get_allreduce_fn(self, param): + @paddle.no_grad() + def reduce(*_): + if param.name in self._task_flow.full_grad.keys(): + full_grad = self._task_flow.full_grad[param.name] + with 
paddle.amp.auto_cast(enable=False): + if not self._accumulate_grads: + full_grad.scale_(scale=self._world_size_scaling) + # Only support sync allreduce current rank's layer now + dist.all_reduce( + tensor=full_grad, + group=self._group, + use_calc_stream=True) + dist.wait( + tensor=full_grad, + group=self._group, + use_calc_stream=True) + + start, end = self._param2buffer[param.name][self._rank] + if not self._accumulate_grads or param.bw_storage is None: + param.bw_storage = core.VarBase( + full_grad._slice(start, end)).detach().clone() + else: + param.bw_storage.add_( + core.VarBase(full_grad._slice(start, end)).detach() + .clone()) + param.clear_gradient(False) + param._gradient_set_empty(False) + tmp_var = self._task_flow.full_grad.pop(param.name) + tmp_var._clear() + + if param.name in self._task_flow.full_param.keys(): + if param.status == "all": + param.use_count = 0 + param._clear() + start, end = self._param2buffer[param.name][self._rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = core.VarBase( + self._task_flow.full_param[param.name]._slice(start, + end), + param.name + "@slice").detach().clone() + param.status = "part" + tmp_var = self._task_flow.full_param.pop(param.name) + tmp_var._clear() + + return reduce + + def _redefine_opt_step(self): + params_slice_func = self._update_params_slice + opt_step = self._optim.step + update_scaler = self._optim.update_scaler + + def _opt_step(self): + if not update_scaler: + params_slice_func() + opt_step() + + self._optim.step = MethodType(_opt_step, self._optim) + + def _redefine_opt_clear(self): + clear_func = self._clear_gradients + + def _opt_clear(self): + clear_func() + + self._optim.clear_grad = MethodType(_opt_clear, self._optim) + + +def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer, rank, + group, sync_comm, task_flow): + + # Record layer's id + layer_id = id(layer) + use_calc, sync_wait = False, False + + if layer_id not in order_tracer.keys() or sync_comm: + use_calc, sync_wait = True, True + task_flow.use_calc[layer_id] = use_calc + else: + task_flow.use_calc[layer_id] = use_calc + _wait_layer(trainable_params, layer_id, task_flow, group, use_calc) + + if layer_id == order_tracer["layer"][-1]: return + order_ = order_tracer[layer_id] + layer_id = order_tracer["layer"][order_ + 1] + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + return + + +class ForwardPostHooks(PyLayer): + @staticmethod + def forward(ctx, inputs, layer, order_tracer, trainable_params, + param2buffer, param2buffer_size, rank, group, sync_comm, + task_flow): + _release_param(layer, trainable_params, param2buffer, rank, task_flow) + + layer_id = id(layer) + if layer_id not in order_tracer.keys(): + order_ = order_tracer["order"] + order_tracer[layer_id] = order_ + order_tracer["order"] += 1 + order_tracer["layer"].append(layer_id) + ctx.order_tracer = order_tracer + ctx.task_flow = task_flow + ctx.group = group + ctx.layer = layer + ctx.sync_comm = sync_comm + ctx.trainable_params = trainable_params + ctx.param2buffer_size = param2buffer_size + + return inputs + + @staticmethod + def backward(ctx, *args): + # Load context value + order_tracer = ctx.order_tracer + task_flow = ctx.task_flow + group = ctx.group + layer = ctx.layer + trainable_params = ctx.trainable_params + param2buffer_size = ctx.param2buffer_size + sync_comm = ctx.sync_comm + layer_id = id(layer) + use_calc, sync_wait = False, False + if sync_comm: + use_calc, 
sync_wait = True, True + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + else: + _wait_layer(trainable_params, layer_id, task_flow, group, use_calc) + _create_params_grad(layer, trainable_params, param2buffer_size, + task_flow) + task_flow.use_calc[layer_id] = use_calc + if layer_id != order_tracer["layer"][0] and not sync_comm: + layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1] + _allgather_buffer( + layer_next_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + + return args + + +class TaskFlow: + """ + Task flows, one way linked list for task acquisition. + """ + + def __init__(self, + full_param=dict(), + full_grad=dict(), + use_calc=dict(), + callback=None): + self.full_param = full_param + self.full_grad = full_grad + self.use_calc = use_calc + self.callback = callback + + +def _release_param(layer, trainable_params, param2buffer, rank, task_flow): + for param in trainable_params[id(layer)]: + # async communicate share weight not clear + param.use_count -= 1 + if param.use_count == 0: + param._clear() + if param.name in task_flow.full_param.keys(): + start, end = param2buffer[param.name][rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = core.VarBase( + task_flow.full_param[param.name]._slice(start, end), + param.name + "@slice").detach().clone() + param.status = "part" + tmp_var = task_flow.full_param.pop(param.name) + tmp_var._clear() + return + + +def _wait_layer(trainable_params, layer_id, task_flow, group, use_calc_stream): + for param in trainable_params[layer_id]: + if param.status == "all": + param.use_count += 1 + continue + if param.name in task_flow.full_param.keys(): + full_param = task_flow.full_param[param.name] + with paddle.amp.auto_cast(enable=False): + paddle.device.cuda.synchronize() + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + else: + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream, + task_flow, + sync_wait=True) + break + return task_flow + + +def _allgather_buffer(layer_id, + trainable_params, + group, + use_calc_stream, + task_flow, + sync_wait=False): + for param in trainable_params[layer_id]: + if param.status == "all": + param.use_count += 1 + continue + with paddle.amp.auto_cast(enable=False): + full_param = _all_gather( + param.fw_storage, group, use_calc_stream=use_calc_stream) + if sync_wait: + with paddle.amp.auto_cast(enable=False): + dist.wait( + tensor=full_param, + group=group, + use_calc_stream=use_calc_stream) + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + task_flow.full_param[param.name] = full_param + return task_flow + + +@paddle.no_grad() +def _create_params_grad(layer, trainable_params, param2buffer_size, task_flow): + for param in trainable_params[id(layer)]: + if param.name in task_flow.full_grad.keys(): + continue + assert isinstance(param2buffer_size[param.name], int) + temp_grad = paddle.zeros( + [param2buffer_size[param.name]], dtype=param.dtype) + param._copy_gradient_from( + core.VarBase(temp_grad._slice(0, param._numel()))) + task_flow.full_grad[param.name] 
= temp_grad + return task_flow + + +def _PartitionParam(param): + if not hasattr(param, "fw_storage"): + setattr(param, "fw_storage", None) + setattr(param, "bw_storage", None) + setattr(param, "status", "all") + setattr(param, "use_count", 0) + return param + + +def _VarBaseWrapper(param): + varbase = param.fw_storage + tmp_param = ParamBase( + shape=varbase.shape, dtype=varbase.dtype, name="slice@" + param.name) + varbase._share_buffer_to(tmp_param) + tmp_param.regularizer = param.regularizer + tmp_param.optimize_attr['learning_rate'] = param.optimize_attr[ + 'learning_rate'] + varbase._clear() + return tmp_param + + +def _OptimizerWrapper(optimizer, offload, group, update_params_slice): + if not hasattr(optimizer, "_optim"): + setattr(optimizer, "_optim", optimizer) + setattr(optimizer, "offload", offload) + setattr(optimizer, "group", group) + setattr(optimizer, "update_scaler", None) + setattr(optimizer, "update_slice", update_params_slice) + return optimizer + + +def _current_layer_params(layer): + return layer.parameters( + include_sublayers=False) + list(layer.extra_parameters) if hasattr( + layer, "extra_parameters") else layer.parameters( + include_sublayers=False) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 272aada576be8..5f696195c1abc 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -152,6 +152,9 @@ def unscale_method(self, optimizer): param_grads = [] param_grads_fp16 = [] param_grads_fp32 = [] + if hasattr(optimizer, "update_slice"): + optimizer.update_slice() + optimizer.update_scaler = True if getattr(optimizer._optim, '_param_groups', None) and isinstance( optimizer._optim._param_groups[0], dict): @@ -161,27 +164,21 @@ def unscale_method(self, optimizer): if param._grad_ivar() is not None: param_grads.append(param._grad_ivar()) if param._grad_ivar( - ).dtype == core.VarDesc.VarType.FP16: + ).dtype in [core.VarDesc.VarType.FP16, paddle.float16]: param_grads_fp16.append(param._grad_ivar()) else: param_grads_fp32.append(param._grad_ivar()) else: - param_grads = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if param._grad_ivar() is not None - ] - param_grads_fp16 = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if (param._grad_ivar() is not None - ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 - ) - ] - param_grads_fp32 = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if (param._grad_ivar() is not None - ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 - ) - ] + for param in optimizer._optim._parameter_list: + if param.grad is not None: + param_grads.append(param.grad) + if param.grad.dtype in [ + core.VarDesc.VarType.FP16, paddle.float16 + ]: + param_grads_fp16.append(param.grad) + else: + param_grads_fp32.append(param.grad) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 67697fcfd8398..c0c13866ccd55 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -34,6 +34,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS 
test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_dygraph_sharding_optimizer_stage2) list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2) +list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3) list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) @@ -250,6 +251,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2) + list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3) list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) @@ -1058,6 +1060,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py new file mode 100644 index 0000000000000..5b0bec9c454b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -0,0 +1,233 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
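+# Compares parameters trained with ShardingStage2 and ShardingStage3 (fp32, fp16, gradient accumulation),
+# and ShardingStage3 with and without sync_comm (recompute), expecting matching results.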
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +epoch = 10 +batch_size = 32 +paddle.seed(2021) +np.random.seed(2021) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 +fleet.init(is_collective=True) + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[{ + "params": model.parameters() + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + use_pure_fp16=False, + accumulate_grad=False, + opt_group=False, + recompute=False): + group = paddle.distributed.new_group([0, 1]) + if opt_group: + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if use_pure_fp16: + model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = ShardingScaler(scaler) + if sharding_stage == 2: + optimizer = ShardingOptimizerStage2( + params=model.parameters(), optim=optimizer, group=group) + model = ShardingStage2( + model, + optimizer, + group=group, + buffer_max_size=2**21, + accumulate_grads=accumulate_grad) + elif sharding_stage == 3: + model = ShardingStage3( + model, optimizer=optimizer, group=group, sync_comm=recompute) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if not accumulate_grad: + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + 
scaler.update() + optimizer.clear_grad() + if accumulate_grad: + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if sharding_stage == 3: + model.get_all_parameters() + return model.parameters() + + +def test_stage2_stage3(): + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8 = MLP(), MLP(), MLP( + ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + mlp8.set_state_dict(state_dict) + # fp32 + stage2_params = train_mlp( + mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=True) + stage3_params = train_mlp( + mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=True) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp32 accumulate grad + stage2_params = train_mlp( + mlp3, + sharding_stage=2, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + stage3_params = train_mlp( + mlp4, + sharding_stage=3, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp16 + stage2_params = train_mlp( + mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False) + stage3_params = train_mlp( + mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp16 recompute + stage3_params = train_mlp( + mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False) + stage3_params_re = train_mlp( + mlp8, + sharding_stage=3, + use_pure_fp16=True, + opt_group=False, + recompute=True) + for i in range(len(stage3_params)): + for j in range(len(stage3_params_re)): + if stage3_params[i].name == stage3_params_re[j].name: + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_re[j].numpy(), + rtol=1e-6) + return + + +if __name__ == '__main__': + test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py new file mode 100644 index 0000000000000..89d5f2e8c7b29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage3(TestMultipleGpus): + + # check sharding logic as well as the accuracy with single mode + def test_dygraph_sharding_optimizer_stage3(self): + self.run_mnist_2gpu('dygraph_sharding_stage3.py') + + +if __name__ == "__main__": + unittest.main() From 0de8a805a89eb70203163a34858ff504afff30df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:05:00 +0800 Subject: [PATCH 21/24] [infrt] update the version of llvm. test=develop (#38843) --- cmake/external/llvm.cmake | 13 +- paddle/infrt/CMakeLists.txt | 1 - paddle/infrt/common/global.h | 2 +- paddle/infrt/dialect/CMakeLists.txt | 6 +- paddle/infrt/dialect/basic_kernels.cc | 22 +-- paddle/infrt/dialect/basic_kernels.h | 5 +- paddle/infrt/dialect/basic_kernels.td | 7 +- paddle/infrt/dialect/dense_tensor.cc | 148 +++++------------- paddle/infrt/dialect/dense_tensor.h | 51 ++++-- paddle/infrt/dialect/diagnostic_utils.cc | 7 +- paddle/infrt/dialect/diagnostic_utils.h | 6 +- paddle/infrt/dialect/dialect.cc | 16 +- paddle/infrt/dialect/infrt_base.cc | 6 +- paddle/infrt/dialect/infrt_base.h | 32 ++-- paddle/infrt/dialect/infrt_base.td | 6 +- paddle/infrt/dialect/init_infrt_dialects.cc | 12 +- paddle/infrt/dialect/init_infrt_dialects.h | 8 +- paddle/infrt/dialect/mlir_loader.cc | 18 ++- paddle/infrt/dialect/mlir_loader.h | 9 +- paddle/infrt/dialect/mlir_loader_test.cc | 11 +- paddle/infrt/dialect/mlir_tests/rewrite.mlir | 2 +- .../dialect/mlir_tests/rewrite_conv_bn.mlir | 2 +- paddle/infrt/dialect/mlir_tests/trt_ops.mlir | 2 +- paddle/infrt/dialect/ops.td | 6 - paddle/infrt/dialect/opt.cc | 26 +-- paddle/infrt/dialect/pd_op_base.td | 2 +- paddle/infrt/dialect/pd_ops.cc | 29 ++-- paddle/infrt/dialect/pd_ops.h | 36 ++--- paddle/infrt/dialect/pd_ops.td | 14 +- paddle/infrt/dialect/pd_types.h | 11 +- paddle/infrt/dialect/print_ir.cc | 45 +++--- paddle/infrt/dialect/tensor_shape.cc | 16 +- paddle/infrt/dialect/tensor_shape.h | 8 +- paddle/infrt/dialect/tensor_shape_base.td | 4 +- paddle/infrt/dialect/tensorrt/trt_exec.cc | 4 +- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 78 +++++---- .../dialect/tensorrt/trt_graph_fuse_pass.h | 12 +- .../dialect/tensorrt/trt_graph_split_pass.cc | 20 +-- .../dialect/tensorrt/trt_graph_split_pass.h | 10 +- .../dialect/tensorrt/trt_op_teller_pass.cc | 25 ++- .../dialect/tensorrt/trt_op_teller_pass.h | 14 +- paddle/infrt/dialect/tensorrt/trt_ops.cc | 22 ++- paddle/infrt/dialect/tensorrt/trt_ops.h | 41 +++-- paddle/infrt/dialect/test_kernels.cc | 75 ++++----- paddle/infrt/dialect/test_kernels.h | 7 +- paddle/infrt/dialect/types.cc | 17 -- paddle/infrt/dialect/types.h | 16 -- paddle/infrt/host_context/core_runtime.cc | 6 +- paddle/infrt/host_context/core_runtime.h | 6 +- paddle/infrt/host_context/kernel_frame.h | 6 +- .../host_context/kernel_registry_test.cc | 6 +- .../infrt/host_context/kernel_utils_test.cc | 6 +- .../host_context/mlir_function_executable.cc | 1 + .../host_context/mlir_function_executable.h | 3 +- .../host_context/mlir_program_executor.h | 4 +- .../host_context/mlir_to_runtime_translate.cc | 90 ++++++----- .../host_context/mlir_to_runtime_translate.h | 8 +- .../mlir_to_runtime_translate_test.cc | 12 +- 
paddle/infrt/host_context/op_executable.cc | 7 +- paddle/infrt/host_context/op_executable.h | 12 +- paddle/infrt/kernel/basic_kernels.cc | 6 +- paddle/infrt/kernel/basic_kernels.h | 12 +- paddle/infrt/kernel/tensor_kernels.cc | 6 +- paddle/infrt/kernel/tensor_kernels.h | 12 +- paddle/infrt/kernel/tensor_shape_kernels.cc | 6 +- paddle/infrt/kernel/tensor_shape_kernels.h | 12 +- paddle/infrt/kernel/test_kernels.cc | 6 +- paddle/infrt/kernel/test_kernels.h | 12 +- paddle/infrt/paddle/cpp/desc_api.h | 8 +- paddle/infrt/paddle/model_parser.cc | 6 +- paddle/infrt/paddle/model_parser.h | 6 +- paddle/infrt/paddle/pb/block_desc.cc | 8 +- paddle/infrt/paddle/pb/block_desc.h | 8 +- paddle/infrt/paddle/pb/op_desc.cc | 8 +- paddle/infrt/paddle/pb/op_desc.h | 8 +- paddle/infrt/paddle/pb/program_desc.cc | 8 +- paddle/infrt/paddle/pb/program_desc.h | 8 +- paddle/infrt/paddle/pb/var_desc.cc | 8 +- paddle/infrt/paddle/pb/var_desc.h | 8 +- 79 files changed, 616 insertions(+), 637 deletions(-) delete mode 100644 paddle/infrt/dialect/ops.td delete mode 100644 paddle/infrt/dialect/types.cc delete mode 100644 paddle/infrt/dialect/types.h diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index e080a7359af98..27210e5260048 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -1,7 +1,7 @@ include(FetchContent) -set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz) -set(LLVM_MD5 39d32b6be466781dddf5869318dcba53) +set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz) +set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e) set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) set(FETCHCONTENT_QUIET OFF) @@ -51,7 +51,7 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") # To build with MLIR, the LLVM is build from source code using the following flags: #[==[ -cmake -G Ninja ../llvm \ +cmake ../llvm -G "Unix Makefiles" \ -DLLVM_ENABLE_PROJECTS="mlir;clang" \ -DLLVM_BUILD_EXAMPLES=OFF \ -DLLVM_TARGETS_TO_BUILD="X86" \ @@ -59,8 +59,10 @@ cmake -G Ninja ../llvm \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_ENABLE_ZLIB=OFF \ -DLLVM_ENABLE_RTTI=ON \ + -DLLVM_INSTALL_UTILS=ON \ + -DCMAKE_INSTALL_PREFIX=./install #]==] -# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit) +# The matched llvm-project version is b5149f4e66a49a98b67e8e2de4e24a4af8e2781b (currently a temporary commit) add_definitions(${LLVM_DEFINITIONS}) @@ -75,7 +77,7 @@ add_definitions(${LLVM_DEFINITIONS}) # The minimum needed libraries for MLIR IR parse and transform. 
-set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) +set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) # tb_base is the name of a xxx.td file (without the .td suffix) @@ -89,6 +91,7 @@ function(mlir_tablegen_on td_base) mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) if (mlir_tablegen_on_DIALECT) mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT}) + mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs -dialect=${mlir_tablegen_on_DIALECT}) endif() add_public_tablegen_target(${td_base}_IncGen) add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 8f05d286bf033..8af3012a220ad 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -77,7 +77,6 @@ add_subdirectory(paddle) # MLIR td file generations set(infrt_mlir_incs - ops_inc basic_kernels_inc test_kernels_inc infrt_base_inc diff --git a/paddle/infrt/common/global.h b/paddle/infrt/common/global.h index f89164d03f31d..e6586cb3a3c60 100644 --- a/paddle/infrt/common/global.h +++ b/paddle/infrt/common/global.h @@ -14,7 +14,7 @@ #pragma once -#include "mlir/IR/MLIRContext.h" +#include #include "paddle/infrt/tensor/dense_host_tensor.h" namespace infrt { diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index d145843684c63..c064b2145266b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -2,7 +2,6 @@ core_gather_headers() gather_srcs(infrt_src SRCS dialect.cc - types.cc basic_kernels.cc test_kernels.cc infrt_base.cc @@ -14,8 +13,6 @@ gather_srcs(infrt_src SRCS pd_types.cc pd_ops.cc ) - -mlir_tablegen_on(ops) mlir_tablegen_on(basic_kernels) mlir_tablegen_on(test_kernels) mlir_tablegen_on(infrt_base DIALECT infrt) @@ -27,8 +24,7 @@ mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) -target_link_libraries(infrtopt infrt ${mlir_libs}) -add_dependencies(infrtopt infrt) +target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc index b4d2b9182b0c5..bad7e73ec5ae5 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -17,17 +17,17 @@ #include #include #include -#include -#include +#include +#include #include #include -#include #include #include #include "paddle/infrt/dialect/dense_tensor.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { using namespace mlir; // NOLINT static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT @@ -71,12 +71,12 @@ static ParseResult parseConstantF64Op(OpAsmParser &parser, // NOLINT static ParseResult parseConstantI32Op(OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT return parseConstantOp( - IntegerType::get(32, result.getContext()), parser, result); + IntegerType::get(result.getContext(), 32), parser, result); } static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT return parseConstantOp( - IntegerType::get(64, result.getContext()), parser, result); + IntegerType::get(result.getContext(), 64), parser, result); } static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT @@ -90,10 +90,10 @@ static ParseResult 
parseReturnOp(OpAsmParser &parser, // NOLINT } static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << "infrt.call " << op.getAttr("callee") << "("; + p << "infrt.call " << op->getAttr("callee") << "("; p.printOperands(op.getOperands()); p << ")"; - p.printOptionalAttrDict(op.getAttrs(), {"callee"}); + p.printOptionalAttrDict(op->getAttrs(), {"callee"}); p << " : "; } @@ -145,7 +145,7 @@ static LogicalResult verify(ConstantF64Op op) { return success(); } static LogicalResult verify(ConstantI64Op op) { return success(); } static LogicalResult verify(ReturnOp op) { - auto function = dyn_cast(op.getParentOp()); + auto function = dyn_cast(op->getParentOp()); if (!function) return success(); @@ -157,8 +157,8 @@ static LogicalResult verify(ReturnOp op) { return success(); } +} // namespace dialect +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/basic_kernels.cpp.inc" - -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/basic_kernels.h index 65316bc1437c0..b82abcd52d28f 100644 --- a/paddle/infrt/dialect/basic_kernels.h +++ b/paddle/infrt/dialect/basic_kernels.h @@ -13,12 +13,9 @@ // limitations under the License. #pragma once +#include #include #include -using namespace mlir; // NOLINT - -namespace infrt::dialect { #define GET_OP_CLASSES #include "paddle/infrt/dialect/basic_kernels.hpp.inc" -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td index df5e4d8a2c6a1..7d8de79fbae2b 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/basic_kernels.td @@ -27,7 +27,7 @@ def CallOp : INFRT_Op<"call"> { let results = (outs Variadic); let extraClassDeclaration = [{ - StringRef getCallee() { return callee(); } + mlir::StringRef getCallee() { return callee(); } mlir::FunctionType getCalleeType(); }]; } @@ -57,9 +57,8 @@ def ReturnOp : INFRT_Op<"return", [Terminator]> { let arguments = (ins Variadic:$operands); - let builders = [OpBuilder< - "OpBuilder &b, OperationState &result", - [{ build(b, result, llvm::None); }]>]; + let builders = [OpBuilder<(ins), + [{ build($_builder, $_state, llvm::None); }]>]; } class AddOp : INFRT_Op<"add." 
# suffix, [NoSideEffect]> { diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc index 629a7b16523fc..7685cdc65b9ad 100644 --- a/paddle/infrt/dialect/dense_tensor.cc +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -17,12 +17,11 @@ #include #include #include +#include +#include #include -#include -#include #include #include -#include #include #include @@ -31,68 +30,37 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensor_shape.h" -namespace infrt::dt { - +namespace infrt { +namespace dt { void DTDialect::initialize() { - allowUnknownTypes(); addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/dense_tensor.cpp.inc" >(); } -namespace detail { -struct TensorTypeStorage : public mlir::TypeStorage { - TensorTypeStorage(TargetType target, - LayoutType layout, - PrecisionType precision) - : target_(target), layout_(layout), precision_(precision) {} - - using KeyTy = std::tuple; - - bool operator==(const KeyTy &key) const { - return key == KeyTy(target_, layout_, precision_); - } - - static llvm::hash_code hashKey(const KeyTy &key) { - return llvm::hash_value(key); - } - - static TensorTypeStorage *construct( - mlir::TypeStorageAllocator &allocator, // NOLINT - const KeyTy &key) { - return new (allocator.allocate()) - TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); - } - - TargetType target_; - LayoutType layout_; - PrecisionType precision_; -}; -} // namespace detail - llvm::Optional GetTargetType(mlir::StringRef key) { - if (key.equals_lower("x86")) + if (key.equals_insensitive("x86")) return TargetType::X86; - else if (key.equals_lower("cuda")) + else if (key.equals_insensitive("cuda")) return TargetType::CUDA; else return llvm::None; } llvm::Optional GetLayoutType(mlir::StringRef key) { - if (key.equals_lower("nchw")) + if (key.equals_insensitive("nchw")) return LayoutType::NCHW; - else if (key.equals_lower("nhwc")) + else if (key.equals_insensitive("nhwc")) return LayoutType::NHWC; else return llvm::None; } llvm::Optional GetPrecisionType(mlir::StringRef key) { - if (key.equals_lower("i32")) + if (key.equals_insensitive("i32")) return PrecisionType::I32; - else if (key.equals_lower("f32")) + else if (key.equals_insensitive("f32")) return PrecisionType::F32; else return llvm::None; @@ -111,7 +79,7 @@ LayoutType TensorType::layout() { return getImpl()->layout_; } PrecisionType TensorType::precision() { return getImpl()->precision_; } -raw_ostream &operator<<(raw_ostream &os, TensorType tensorType) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType) { os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() << ", " << tensorType.precision() << ">"; return os; @@ -133,7 +101,7 @@ StringType StringType::get(mlir::MLIRContext *context) { return Base::get(context); } -raw_ostream &operator<<(raw_ostream &os, TargetType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type) { switch (type) { case (TargetType::X86): os << "X86"; @@ -147,7 +115,7 @@ raw_ostream &operator<<(raw_ostream &os, TargetType type) { return os; } -raw_ostream &operator<<(raw_ostream &os, LayoutType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type) { switch (type) { case (LayoutType::NCHW): os << "NCHW"; @@ -161,7 +129,7 @@ raw_ostream &operator<<(raw_ostream &os, LayoutType type) { return os; } -raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type) { 
switch (type) { case (PrecisionType::I32): os << "I32"; @@ -175,103 +143,69 @@ raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { return os; } -static Type getTensorType(mlir::MLIRContext *context) { - auto t_dialect = Identifier::get("t", context); - return OpaqueType::get(t_dialect, "tensor", context); +static mlir::Type getTensorType(mlir::MLIRContext *context) { + auto t_dialect = mlir::Identifier::get("t", context); + return mlir::OpaqueType::get(t_dialect, "tensor"); } -static ParseResult parseCreateUninitTensorOp( - OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT +static mlir::ParseResult parseCreateUninitTensorOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT auto loc = parser.getCurrentLocation(); - ::mlir::Type outputRawTypes[1]; - ::llvm::ArrayRef<::mlir::Type> outputTypes(outputRawTypes); + mlir::Type outputRawTypes[1]; + ::llvm::ArrayRef outputTypes(outputRawTypes); mlir::ArrayAttr shapeAttr; if (parser.parseAttribute(shapeAttr, parser.getBuilder().getI64Type(), "shape", result.attributes)) - return failure(); - if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + return mlir::failure(); + if (parser.parseOptionalAttrDict(result.attributes)) return mlir::failure(); - if (parser.parseArrow()) return failure(); - if (parser.parseType(outputRawTypes[0])) return failure(); + if (parser.parseArrow()) return mlir::failure(); + if (parser.parseType(outputRawTypes[0])) return mlir::failure(); if (!outputRawTypes[0].isa()) return parser.emitError(loc, "invalid kind of type specified"); result.addTypes(outputTypes); - return success(); + return mlir::success(); } template -static void printCreateUninitTensorOp(OpAsmPrinter &p, // NOLINT +static void printCreateUninitTensorOp(mlir::OpAsmPrinter &p, // NOLINT CreateUninitTensorOp op) { p << CreateUninitTensorOp::getOperationName(); p << " "; p.printAttributeWithoutType(op.shapeAttr()); - p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"shape"}); + p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"shape"}); p << " -> "; p << op.getOperation()->getResultTypes(); } -// TODO(shibo): can be removed? -// static ParseResult parseFillTensorWithConstantOp(OpAsmParser& parser, -// OperationState& result) { -// auto loc = parser.getCurrentLocation(); -// ::mlir::OpAsmParser::OperandType inputRawOperands[1]; -// ::llvm::ArrayRef<::mlir::OpAsmParser::OperandType> -// inputOperands(inputRawOperands); -// ::mlir::Type inputRawTypes[1]; -// ::llvm::ArrayRef<::mlir::Type> inputTypes(inputRawTypes); -// -// if (parser.parseOperand(inputRawOperands[0])) return failure(); -// -// if (parser.parseColon()) return failure(); -// if (parser.parseType(inputRawTypes[0])) return failure(); -// if (!inputRawTypes[0].isa()) -// return parser.emitError(loc, "invalid kind of type specified"); -// -// Attribute value_attr; -// if (parser.resolveOperands(inputOperands, inputTypes, loc, result.operands)) -// return failure(); -// if (parser.parseAttribute(value_attr, "value", result.attributes)) return -// failure(); -// return success(); -//} - -// TODO(shibo): can be removed? 
-// template -// static void printFillTensorWithConstantOp(OpAsmPrinter& p, FillTensorOp op) { -// p << FillTensorOp::getOperationName(); -// p << " "; -// p.printOperand(op.getOperand()); -// p << " : "; -// p << op.getOperation()->getOperandTypes(); -// p << " "; -// p << op.getAttr("value"); -//} - -static ParseResult parseSetTensorOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SmallVector operands; - if (parser.parseOperandList(operands, 1)) return failure(); +static mlir::ParseResult parseSetTensorOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT + llvm::SmallVector operands; + if (parser.parseOperandList(operands, 1)) return mlir::failure(); auto tensor_type = getTensorType(result.getContext()); - Attribute value_attr; - return failure( + mlir::Attribute value_attr; + return mlir::failure( parser.resolveOperand(operands[0], tensor_type, result.operands) || parser.parseAttribute(value_attr, "values", result.attributes)); } template -static void printSetTensorOp(OpAsmPrinter &p, SetTensorOp op) { // NOLINT +static void printSetTensorOp(mlir::OpAsmPrinter &p, SetTensorOp op) { // NOLINT p << SetTensorOp::getOperationName() << " "; p.printOperand(op.getOperand()); - p << " " << op.getAttr("values"); + p << " " << op->getAttr("values"); } +} // namespace dt +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/dense_tensor.cpp.inc" // NOLINT -} // namespace infrt::dt +#include "paddle/infrt/dialect/dense_tensor_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 866c62213ab05..416925d3382ba 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,13 +19,8 @@ #include -using namespace mlir; // NOLINT -namespace infrt::dt { - -namespace detail { -struct TensorTypeStorage; -} // namespace detail - +namespace infrt { +namespace dt { enum class TargetType : uint8_t { X86, CUDA }; enum class LayoutType : uint8_t { NCHW, NHWC }; enum class PrecisionType : uint8_t { I32, F32 }; @@ -34,9 +29,39 @@ llvm::Optional GetTargetType(mlir::StringRef key); llvm::Optional GetLayoutType(mlir::StringRef key); llvm::Optional GetPrecisionType(mlir::StringRef key); -raw_ostream &operator<<(raw_ostream &os, TargetType type); -raw_ostream &operator<<(raw_ostream &os, LayoutType type); -raw_ostream &operator<<(raw_ostream &os, PrecisionType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type); + +namespace detail { +struct TensorTypeStorage : public mlir::TypeStorage { + TensorTypeStorage(TargetType target, + LayoutType layout, + PrecisionType precision) + : target_(target), layout_(layout), precision_(precision) {} + + using KeyTy = std::tuple; + + bool operator==(const KeyTy &key) const { + return key == KeyTy(target_, layout_, precision_); + } + + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + static TensorTypeStorage *construct( + mlir::TypeStorageAllocator &allocator, // NOLINT + const KeyTy &key) { + return new (allocator.allocate()) + TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); + } + + TargetType target_; + LayoutType layout_; + PrecisionType precision_; +}; +} // namespace detail class TensorType : public mlir::Type::TypeBase #include -namespace infrt::dialect { +namespace infrt { 
+namespace dialect { struct MyScopedDiagnosicHandler::Impl { Impl() : diag_stream_(diag_str_) {} @@ -49,4 +51,5 @@ mlir::LogicalResult MyScopedDiagnosicHandler::handler(mlir::Diagnostic *diag) { return mlir::failure(true); } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/diagnostic_utils.h b/paddle/infrt/dialect/diagnostic_utils.h index 3a8098cf75181..746e61c8fe5c3 100644 --- a/paddle/infrt/dialect/diagnostic_utils.h +++ b/paddle/infrt/dialect/diagnostic_utils.h @@ -18,7 +18,8 @@ #include -namespace infrt::dialect { +namespace infrt { +namespace dialect { /** * A scoped diagnostic handler to help debug MLIR process. @@ -36,4 +37,5 @@ class MyScopedDiagnosicHandler : public mlir::SourceMgrDiagnosticHandler { std::unique_ptr impl_; }; -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/dialect.cc b/paddle/infrt/dialect/dialect.cc index cbcd5d0f0fa78..fe07b91d22ed5 100644 --- a/paddle/infrt/dialect/dialect.cc +++ b/paddle/infrt/dialect/dialect.cc @@ -13,24 +13,26 @@ // limitations under the License. #include +#include #include -#include #include #include -#include #include #include -namespace infrt::hlir::dialect { +namespace infrt { +namespace hlir { +namespace dialect { -class CinnDialect : public ::mlir::Dialect { +class CinnDialect : public mlir::Dialect { public: - explicit CinnDialect(::mlir::MLIRContext* ctx); + explicit CinnDialect(mlir::MLIRContext* ctx); //! We should register this function in dialect static llvm::StringRef getDialectNamespace() { return "infrt::hlir::dialect"; } }; - -} // namespace infrt::hlir::dialect +} // namespace dialect +} // namespace hlir +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc index b28ad5ad4b5a5..e8005661bbd65 100644 --- a/paddle/infrt/dialect/infrt_base.cc +++ b/paddle/infrt/dialect/infrt_base.cc @@ -18,7 +18,8 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/test_kernels.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { // ----INFRTDialect definition begin---- void INFRTDialect::initialize() { @@ -124,4 +125,5 @@ void INFRTDialect::printType(mlir::Type type, // ----INFRTDialect definition end---- -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index 58acd7c9a409a..1a7fbcf395a6e 100644 --- a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -18,19 +18,17 @@ #include #include #include -#include #include #include #include "paddle/infrt/dialect/infrt_base.hpp.inc" -namespace infrt::dialect { - -class INFRTDialect : public ::mlir::Dialect { - explicit INFRTDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect(getDialectNamespace(), - context, - ::mlir::TypeID::get()) { +namespace infrt { +namespace dialect { +class INFRTDialect : public mlir::Dialect { + explicit INFRTDialect(mlir::MLIRContext *context) + : mlir::Dialect( + getDialectNamespace(), context, mlir::TypeID::get()) { initialize(); } @@ -41,15 +39,12 @@ class INFRTDialect : public ::mlir::Dialect { mlir::DialectAsmPrinter &printer) const override; void initialize(); - friend class ::mlir::MLIRContext; + friend class mlir::MLIRContext; public: static ::llvm::StringRef getDialectNamespace() { return "infrt"; } }; - -} // namespace infrt::dialect - -namespace mlir { +} // namespace dialect template static 
mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT @@ -58,17 +53,16 @@ static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT return b.getIntegerAttr(b.getI32Type(), constant); } -static mlir::SmallVector<::mlir::Value, 4> cvtValueToValueRange( +static mlir::SmallVector cvtValueToValueRange( const mlir::Value &operand) { - return mlir::SmallVector<::mlir::Value, 4>(1, operand); + return mlir::SmallVector(1, operand); } -static mlir::SmallVector<::mlir::Value, 4> concatTwoValueRange( +static mlir::SmallVector concatTwoValueRange( mlir::ValueRange operand_0, mlir::ValueRange operand_1) { - mlir::SmallVector<::mlir::Value, 4> operands; + mlir::SmallVector operands; operands.append(operand_0.begin(), operand_0.end()); operands.append(operand_1.begin(), operand_1.end()); return operands; } - -} // namespace mlir +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 7d6fdbbbf2f68..1abd294236d93 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -28,11 +28,11 @@ def TensorMapType : def BufferType : OpaqueType<"b", "buffer", "buffer">; class INFRT_createI32Attr : NativeCodeCall< - "mlir::createI32Attr($_builder, $_loc, " # value # ")">; + "infrt::createI32Attr($_builder, $_loc, " # value # ")">; def INFRT_cvtValueToValueRange : NativeCodeCall< - "mlir::cvtValueToValueRange($0)">; + "infrt::cvtValueToValueRange($0)">; def INFRT_concatTwoValueRange : NativeCodeCall< - "mlir::concatTwoValueRange($0, $1)">; + "infrt::concatTwoValueRange($0, $1)">; #endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 4bc2bf70942d2..c3769414dbb39 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -23,12 +23,10 @@ #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { - -void RegisterCinnDialects(mlir::DialectRegistry& registry) { // NOLINT - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); +void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT + registry.insert(); } - } // namespace infrt diff --git a/paddle/infrt/dialect/init_infrt_dialects.h b/paddle/infrt/dialect/init_infrt_dialects.h index 50caca018980d..0912e9ef2555b 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.h +++ b/paddle/infrt/dialect/init_infrt_dialects.h @@ -14,10 +14,8 @@ #pragma once -#include "mlir/IR/Dialect.h" - +#include +#include namespace infrt { - -void RegisterCinnDialects(mlir::DialectRegistry& registry); // NOLINT - +void registerCinnDialects(mlir::DialectRegistry ®istry); // NOLINT } // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc index b318a6a763483..1d0696e77dcda 100644 --- a/paddle/infrt/dialect/mlir_loader.cc +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -16,8 +16,8 @@ #include #include +#include #include -#include #include #include #include @@ -30,12 +30,15 @@ #include "paddle/infrt/dialect/diagnostic_utils.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source) { // context->allowUnregisteredDialects(); - RegisterCinnDialects(context->getDialectRegistry()); + mlir::DialectRegistry registry; + registerCinnDialects(registry); + context->appendDialectRegistry(registry); // Currenetly, We only used the 
CinnDialect and mlir::BuiltinDialect is // enough。Don't need StandardOpsDialect. // context->getDialectRegistry().insert(); @@ -57,9 +60,9 @@ mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context) { // context->allowUnregisteredDialects(); - RegisterCinnDialects(context->getDialectRegistry()); - context->getDialectRegistry().insert(); - + mlir::DialectRegistry registry; + registerCinnDialects(registry); + context->appendDialectRegistry(registry); mlir::ScopedDiagnosticHandler scope_handler( context, [](mlir::Diagnostic& diag) { if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) @@ -71,4 +74,5 @@ mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, return mlir::parseSourceFile(std::string(file_name), context); } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h index 092da7d9ce03f..5e50ad9e5a271 100644 --- a/paddle/infrt/dialect/mlir_loader.h +++ b/paddle/infrt/dialect/mlir_loader.h @@ -15,16 +15,17 @@ #pragma once #include -#include +#include #include #include -namespace infrt::dialect { +namespace infrt { +namespace dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source); mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context); - -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 1b622d585ad8e..1115053073044 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -17,14 +17,15 @@ #include #include #include -#include +#include #include #include #include "paddle/infrt/dialect/init_infrt_dialects.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { TEST(MlirLoader, basic) { mlir::MLIRContext context; @@ -42,8 +43,7 @@ func @main() -> f32 { )ROC"; auto module = LoadMlirSource(&context, source); - module->verify(); - + EXPECT_TRUE(mlir::succeeded(module->verify())); LOG(INFO) << "module name: " << module->getOperationName().data(); for (auto func : module->getOps()) { LOG(INFO) << "get func " << func.getName().str(); @@ -54,4 +54,5 @@ func @main() -> f32 { } } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_tests/rewrite.mlir b/paddle/infrt/dialect/mlir_tests/rewrite.mlir index bfad9d1f6924d..5e207634da8e4 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite.mlir @@ -20,5 +20,5 @@ func @main() -> tensor { %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - infrt.return %e2 : tensor + "pd.fetch"(%e2) {name="output"} :(tensor)->() } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir index 9ea1ec0ebca36..2889b92b18ef0 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir @@ -11,5 +11,5 @@ func @main() -> tensor { %c = "pd.conv2d"(%a, %filter, %bias) {} : (tensor, tensor<3x64x3x3xf32>, tensor<64xf32>) -> tensor %d = "pd.batch_norm"(%c, 
%scale, %bias2, %mean, %var) {} : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor - infrt.return %d : tensor + "pd.fetch"(%d) {name="output"} :(tensor)->() } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir index 009b6d1c19653..d98f107bab41e 100644 --- a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir +++ b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir @@ -18,5 +18,5 @@ func @main() -> tensor { %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - "pd.fetch"(%e2) :(tensor)->() + "pd.fetch"(%e2) {name="output"} :(tensor)->() } diff --git a/paddle/infrt/dialect/ops.td b/paddle/infrt/dialect/ops.td deleted file mode 100644 index 264134a447c63..0000000000000 --- a/paddle/infrt/dialect/ops.td +++ /dev/null @@ -1,6 +0,0 @@ -include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" - - -class INFRT_Op traits = []> : - Op; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index d90d25230d0c2..5bcf5a23f4c53 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -12,34 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include - -#include - -#include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" -#include "paddle/infrt/dialect/mlir_loader.h" int main(int argc, char **argv) { - mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - - auto ®istry = context->getDialectRegistry(); - infrt::RegisterCinnDialects(registry); - + mlir::DialectRegistry registry; + infrt::registerCinnDialects(registry); mlir::registerCanonicalizerPass(); - return mlir::failed( - mlir::MlirOptMain(argc, argv, "INFRT mlir pass driver", registry)); + mlir::MlirOptMain(argc, argv, "infrt mlir pass driver", registry)); } diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index af53df113dfb3..a3e3c4ae59277 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -16,7 +16,7 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. 
}]; - let cppNamespace = "::mlir::pd"; + let cppNamespace = "mlir::pd"; } class PD_Op traits = []> : diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index ce10be6d100f8..fe38996883846 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -14,10 +14,15 @@ #include "paddle/infrt/dialect/pd_ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" +#include +#include #include "paddle/infrt/dialect/infrt_base.h" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT + +#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT + namespace mlir { namespace pd { PaddleDialect::PaddleDialect(MLIRContext *context) @@ -36,12 +41,6 @@ mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, return builder.create(loc, value); } -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT -#undef GET_OP_CLASSES - -#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT - void ConstantOp::build(OpBuilder &builder, OperationState &state, Attribute value) { @@ -66,8 +65,8 @@ LogicalResult ConstantOp::inferReturnTypes( inferredReturnTypes.push_back(attributes.get("value").getType()); return success(); } -::mlir::OpFoldResult ConstantOp::fold( - ::llvm::ArrayRef<::mlir::Attribute> operands) { +mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef operands) { return value(); } @@ -82,11 +81,11 @@ LogicalResult ElementwiseAdd::inferReturnTypes( return success(); } void ElementwiseAdd::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } -::mlir::OpFoldResult ElementwiseAdd::fold( +mlir::OpFoldResult ElementwiseAdd::fold( llvm::ArrayRef operands) { if (getElementTypeOrSelf(getType()).isa()) { if (!operands[0] || !operands[1]) return {}; @@ -154,17 +153,17 @@ LogicalResult MulOp::inferReturnTypes( } void ReluOp::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } void FusedRepeatedFCRelu::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } void BatchNormOp::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h index 71e0a53988d1a..7d1d1d6f58451 100644 --- a/paddle/infrt/dialect/pd_ops.h +++ b/paddle/infrt/dialect/pd_ops.h @@ -14,21 +14,20 @@ #pragma once -#include "mlir/Dialect/Traits.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/DerivedAttributeOpInterface.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include namespace mlir { namespace pd { @@ -53,9 +52,8 @@ class PaddleDialect : public Dialect { } }; -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#undef GET_OP_CLASSES - } // namespace pd } // namespace mlir + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd_ops.td b/paddle/infrt/dialect/pd_ops.td index b020b7ad5dbc7..3addf15082a12 100644 --- a/paddle/infrt/dialect/pd_ops.td +++ b/paddle/infrt/dialect/pd_ops.td @@ -24,6 +24,16 @@ def PD_FeedOp : PD_Op<"feed"> { def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let summary = "fetch Op"; + let description = [{ + Return the output tensor from the subgraph. + }]; + + let arguments = (ins PD_Tensor :$inputs, StrAttr:$name); +} + +def PD_ReturnOp : PD_Op<"return", [Terminator]> { + let summary = "return Op"; + let description = [{ Fetch tensor from the graph. }]; @@ -31,7 +41,7 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins Variadic:$inputs); } -def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { +def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ Describe a paddle graph or subgraph. @@ -50,7 +60,7 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte let hasFolder = 1; let builders = [ - OpBuilder<"OpBuilder &builder, OperationState &state, Attribute value">, + OpBuilder<(ins "Attribute":$value)>, ]; } diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h index 6f9fe56338a9f..0da888a9c0769 100644 --- a/paddle/infrt/dialect/pd_types.h +++ b/paddle/infrt/dialect/pd_types.h @@ -18,12 +18,11 @@ #pragma once -#include "mlir/IR/Diagnostics.h" -#include "mlir/IR/Location.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/IR/Types.h" +#include +#include +#include +#include +#include namespace mlir { namespace PD { diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index 43a3577b90f10..5cfd16ee85943 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -11,26 +11,25 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "llvm/ADT/Optional.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ScopedPrinter.h" -#include "llvm/Support/raw_os_ostream.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Block.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/Region.h" -#include "mlir/IR/Verifier.h" -#include "mlir/Parser.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/Passes.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" @@ -114,17 +113,15 @@ int main(int argc, char **argv) { mlir::registerPassManagerCLOptions(); cl::ParseCommandLineOptions(argc, argv, "mlir demo"); - mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - // context->allowUnregisteredDialects(); - auto ®istry = context->getDialectRegistry(); - infrt::RegisterCinnDialects(registry); - + mlir::DialectRegistry registry; + infrt::registerCinnDialects(registry); + mlir::MLIRContext context(registry); // mlir will verify module automatically after parsing. // https://github.com/llvm/llvm-project/blob/38d18d93534d290d045bbbfa86337e70f1139dc2/mlir/lib/Parser/Parser.cpp#L2051 // mlir::OwningModuleRef module_ref = mlir::parseSourceString(mlir_source, // context); mlir::OwningModuleRef module_ref = - mlir::parseSourceFile(inputFilename, context); + mlir::parseSourceFile(inputFilename, &context); std::cout << "----------print IR Structure begin----------" << std::endl; printOperation(module_ref->getOperation(), 0); std::cout << "----------print IR Structure end----------" << std::endl; diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc index ef5a5525cb22f..92c03818264ee 100644 --- a/paddle/infrt/dialect/tensor_shape.cc +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -17,16 +17,16 @@ #include #include #include +#include +#include #include -#include -#include #include #include -#include #include #include -namespace infrt::ts { +namespace infrt { +namespace ts { using namespace mlir; // NOLINT void TensorShapeDialect::initialize() { @@ -48,8 +48,8 @@ Type TensorShapeDialect::parseType(DialectAsmParser &parser) const { return Type(); } -void TensorShapeDialect::printType(::mlir::Type type, - ::mlir::DialectAsmPrinter &os) const { +void TensorShapeDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &os) const { if (type.isa()) { os << "shape"; return; @@ -61,8 +61,10 @@ void TensorShapeDialect::printType(::mlir::Type type, } llvm_unreachable("unexpected 'shape' type kind"); } +} // namespace ts +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT -} // namespace infrt::ts +#include "paddle/infrt/dialect/tensor_shape_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/tensor_shape.h b/paddle/infrt/dialect/tensor_shape.h index bd3fa8853675a..af892af735d2a 100644 --- a/paddle/infrt/dialect/tensor_shape.h +++ b/paddle/infrt/dialect/tensor_shape.h @@ -17,7 +17,8 @@ #include #include -namespace infrt::ts { +namespace infrt { +namespace ts { class ShapeType : public mlir::Type::TypeBase { @@ -31,10 +32,9 @@ class PartialShapeType : public mlir::Type::TypeBase()">, "!ts.shape type">, 
BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { - let typeDescription = [{ + let description = [{ `!ts.shape type` represents a static tensor shape. }]; } @@ -27,7 +27,7 @@ BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { def TS_PartialShape : DialectType()">, "!ts.partial_shape type">, BuildableType<"$_builder.getType<::infrt::ts::PartialShapeType>()"> { - let typeDescription = [{ + let description = [{ `!ts.partial_shape type` represents either a static tensor shape, unranked tensor shape or a ranked tensor shape with unknown dimension sizes. }]; diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index dc0f2acb2b733..1baef7a3f77fd 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -11,10 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Pass/PassManager.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 181f462962aee..1da80ef2c3b10 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -14,14 +14,13 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" +#include +#include +#include +#include #include #include #include -#include "llvm/ADT/SetVector.h" -#include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/IR/Builders.h" -#include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -32,9 +31,9 @@ namespace { // Reference the function nameed "FlexibleDFS" but defined in: // paddle/fluid/framework/ir/subgraph_detector.cc. -bool reverseDfs(std::vector<::mlir::Operation *> source, - const std::function &func) { - std::unordered_set visited; +bool reverseDfs(std::vector source, + const std::function &func) { + std::unordered_set visited; while (!source.empty()) { auto node = source.back(); source.pop_back(); @@ -44,7 +43,7 @@ bool reverseDfs(std::vector<::mlir::Operation *> source, auto values = node->getOperands(); for (auto value : values) { // if the value is a block argument, the node is nullptr. - ::mlir::Operation *node = value.getDefiningOp(); + mlir::Operation *node = value.getDefiningOp(); if (node != nullptr && !visited.count(node)) { source.emplace_back(node); } @@ -54,19 +53,19 @@ bool reverseDfs(std::vector<::mlir::Operation *> source, } // merge the first&second graph op to a new graph op. 
-void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT - ::mlir::pd::GraphOp first, - ::mlir::pd::GraphOp second) { +void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT + mlir::pd::GraphOp first, + mlir::pd::GraphOp second) { // comput inputs and outputs - ::llvm::SmallVector<::mlir::Value, 4> inputs(first.getOperands()), outputs; - for (::mlir::Value input : second.getOperands()) { + ::llvm::SmallVector inputs(first.getOperands()), outputs; + for (mlir::Value input : second.getOperands()) { if (input.getDefiningOp() != first) { inputs.push_back(input); } } - ::llvm::DenseMap<::mlir::Value, unsigned int> op_output_mapping; - for (::mlir::Value output : first.getResults()) { - for (::mlir::Operation *user : output.getUsers()) { + ::llvm::DenseMap op_output_mapping; + for (mlir::Value output : first.getResults()) { + for (mlir::Operation *user : output.getUsers()) { if (user != second && user->getParentOp() != second) { op_output_mapping[output] = outputs.size(); outputs.push_back(output); @@ -74,19 +73,19 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT } } } - auto fetch_op = second.getBody()->getTerminator(); - outputs.append(fetch_op->getOperands().begin(), - fetch_op->getOperands().end()); - ::llvm::SmallVector<::mlir::Type, 4> fetch_types; + auto return_op = second.getBody()->getTerminator(); + outputs.append(return_op->getOperands().begin(), + return_op->getOperands().end()); + ::llvm::SmallVector return_types; for (auto value : outputs) { - fetch_types.push_back(value.getType()); + return_types.push_back(value.getType()); } // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create<::mlir::pd::GraphOp>(loc, fetch_types, inputs); - ::mlir::Block *block = new ::mlir::Block; + auto graph_op = builder.create(loc, return_types, inputs); + mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), second.getBody()->getOperations(), @@ -98,18 +97,18 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT copy_range.begin(), copy_range.end()); builder.setInsertionPointToEnd(block); - builder.create(loc, outputs); + builder.create(loc, outputs); graph_op.body().push_back(block); // mapping the output unsigned int num_result = first.getNumResults(); - fetch_op = first.getBody()->getTerminator(); + return_op = first.getBody()->getTerminator(); for (unsigned int index = 0; index < num_result; ++index) { auto origin_value = first.getResult(index); if (op_output_mapping.find(origin_value) == op_output_mapping.end()) { - origin_value.replaceAllUsesWith(fetch_op->getOperand(index)); + origin_value.replaceAllUsesWith(return_op->getOperand(index)); } else { - auto inner_value = fetch_op->getOperand(index); + auto inner_value = return_op->getOperand(index); auto outer_value = graph_op.getResult(op_output_mapping[origin_value]); while (!origin_value.use_empty()) { auto replace_value = @@ -128,13 +127,13 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT // Topological sort the function op. 
void topoSortBlock(mlir::Block &body) { // NOLINT - llvm::SetVector toSort; + llvm::SetVector toSort; if (body.empty()) return; for (auto it = body.rbegin(); it != body.rend(); ++it) { toSort.insert(&*it); } - llvm::SetVector result = - ::mlir::topologicalSort(std::move(toSort)); + llvm::SetVector result = + mlir::topologicalSort(std::move(toSort)); for (auto *op : result) { op->moveBefore(body.getTerminator()); } @@ -145,21 +144,21 @@ void topoSortBlock(mlir::Block &body) { // NOLINT // Implementation of the trtGraphFusePass. void trtGraphFusePass::runOnFunction() { mlir::Block &body = getFunction().front(); - ::mlir::OpBuilder builder(&body, body.begin()); + mlir::OpBuilder builder(&body, body.begin()); bool changed = false; do { changed = false; for (auto &op : body) { - ::mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - ::mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(user_op); + mlir::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. - std::vector<::mlir::Operation *> source_nodes; + std::vector source_nodes; for (auto operand : user_op->getOperands()) { auto input = operand.getDefiningOp(); if (input != &op && input != nullptr) { @@ -167,9 +166,8 @@ void trtGraphFusePass::runOnFunction() { } } // Reverse DFS from the source_nodes. - if (!reverseDfs(source_nodes, [&op](const ::mlir::Operation *n) { - return n == &op; - })) { + if (!reverseDfs(source_nodes, + [&op](const mlir::Operation *n) { return n == &op; })) { mergeTwoAdjacentGraphOp(builder, graph_op, user_graph_op); changed = true; break; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index e7134e88f316c..f1e555c6f67ec 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -28,15 +28,15 @@ namespace trt { * %a = "pd.feed"()... * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" %m * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.fetch" %m + * "pd.return" %m * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" %m * } ... * "pd.fetch" %d, %f * @@ -47,13 +47,13 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.fetch" %n, %s + * "pd.return" %n, %s * } ... 
* "pd.fetch" %d, %f * } */ class trtGraphFusePass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index 2b45364de2036..257f2b5285425 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" -#include "mlir/IR/Builders.h" +#include #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" @@ -22,24 +22,24 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void trtGraphSplitPass::runOnFunction() { - std::vector<::mlir::pd::GraphOp> worklist; - ::mlir::Block& block = getFunction().front(); + std::vector worklist; + mlir::Block& block = getFunction().front(); for (auto& op : block) { - ::mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - ::mlir::pd::GraphOp graph_op = worklist.back(); + mlir::pd::GraphOp graph_op = worklist.back(); worklist.pop_back(); - ::mlir::Block* body = graph_op.getBody(); - auto fetch_op = body->getTerminator(); - graph_op.replaceAllUsesWith(fetch_op->getOperands()); + mlir::Block* body = graph_op.getBody(); + auto return_op = body->getTerminator(); + graph_op.replaceAllUsesWith(return_op->getOperands()); auto copy_range = body->without_terminator(); - block.getOperations().splice(::mlir::Block::iterator(graph_op), + block.getOperations().splice(mlir::Block::iterator(graph_op), body->getOperations(), copy_range.begin(), copy_range.end()); diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 092df0cf834e5..d30d186647fc3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -31,9 +31,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.fetch" %n, %s + * "pd.return" (%n, %s) * } ... - * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * * destination func: @@ -42,11 +42,11 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... 
- * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } */ class trtGraphSplitPass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 7b7fbb05c1d13..4e8d40b982b2e 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -14,49 +14,48 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" -#include "mlir/IR/Builders.h" +#include #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { // Implementation of the trtOpTellerPass。 void trtOpTellerPass::runOnFunction() { - ::mlir::Block &body = getFunction().front(); - std::vector<::mlir::Operation *> worklist; + mlir::Block &body = getFunction().front(); + std::vector worklist; worklist.reserve(body.getOperations().size()); for (auto &op : body) { worklist.push_back(&op); } // Build GraphOp. - ::mlir::OpBuilder builder(&body, body.begin()); + mlir::OpBuilder builder(&body, body.begin()); while (!worklist.empty()) { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - auto op1 = ::llvm::dyn_cast_or_null<::mlir::pd::FeedOp>(op); + auto op1 = ::llvm::dyn_cast_or_null(op); if (op1) continue; - auto op2 = ::llvm::dyn_cast_or_null<::mlir::pd::FetchOp>(op); + auto op2 = ::llvm::dyn_cast_or_null(op); if (op2) continue; - auto op3 = ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(op); + auto op3 = ::llvm::dyn_cast_or_null(op); if (op3) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create<::mlir::pd::GraphOp>( + auto graph_op = builder.create( loc, op->getResultTypes(), op->getOperands()); - ::llvm::SmallVector<::mlir::Value, 4> tblgen_repl_values; + ::llvm::SmallVector tblgen_repl_values; for (auto v : - ::llvm::SmallVector<::mlir::Value, 4>{graph_op.getODSResults(0)}) { + ::llvm::SmallVector{graph_op.getODSResults(0)}) { tblgen_repl_values.push_back(v); } op->replaceAllUsesWith(tblgen_repl_values); // Build graph op. - ::mlir::Block *block = new ::mlir::Block; + mlir::Block *block = new mlir::Block; graph_op.body().push_back(block); op->moveBefore(block, block->begin()); builder.setInsertionPointToEnd(block); - builder.create(loc, op->getResults()); + builder.create(loc, op->getResults()); } } } // namespace trt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index b03945b3459c0..fb16c974f7fb3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -29,7 +29,7 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * * destination func: @@ -37,23 +37,23 @@ namespace trt { * %a = "pd.feed"()... * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... 
- * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. */ class trtOpTellerPass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 4c02238b10e1d..35b7967892caf 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -13,27 +13,25 @@ // limitations under the License. #include "paddle/infrt/dialect/tensorrt/trt_ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include namespace infrt { namespace trt { -TensorRTDialect::TensorRTDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect("trt", context, ::mlir::TypeID::get()) { +TensorRTDialect::TensorRTDialect(mlir::MLIRContext *context) + : mlir::Dialect("trt", context, mlir::TypeID::get()) { addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT >(); -#undef GET_OP_LIST } -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT -#undef GET_OP_CLASSES - } // namespace trt } // namespace infrt + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index c9043c2280de0..a37491ec1abc7 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -14,37 +14,32 @@ #pragma once -#include "mlir/Dialect/Traits.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/DerivedAttributeOpInterface.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace infrt { namespace trt { -class TensorRTDialect : public ::mlir::Dialect { +class TensorRTDialect : public mlir::Dialect { public: - explicit TensorRTDialect(::mlir::MLIRContext* context); + explicit TensorRTDialect(mlir::MLIRContext* context); static llvm::StringRef getDialectNamespace() { return "trt"; } }; -// mlir bug。 can be removed safety when update mlir to llvm11. 
-using namespace mlir; // NOLINT +} // namespace trt +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensorrt/trt_ops.hpp.inc" -#undef GET_OP_CLASSES - -} // namespace trt -} // namespace infrt diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc index 894d96f95ad5c..c4588d7cf8bab 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/test_kernels.cc @@ -14,14 +14,13 @@ #include "paddle/infrt/dialect/test_kernels.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/OpDefinition.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" - -namespace infrt::dialect { +#include +#include +#include +#include +namespace infrt { +namespace dialect { //===----------------------------------------------------------------------===// // BenchmarkOp //===----------------------------------------------------------------------===// @@ -32,65 +31,67 @@ namespace infrt::dialect { // ... // } -static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - StringAttr nameAttr; +static mlir::ParseResult parseBenchmarkOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT + mlir::StringAttr nameAttr; if (parser.parseAttribute(nameAttr, "name", result.attributes)) - return failure(); + return mlir::failure(); // Parse the operands, e.g. (%c : i32, %d : f32) - if (parser.parseLParen()) return failure(); + if (parser.parseLParen()) return mlir::failure(); - SmallVector operands; - SmallVector types; + llvm::SmallVector operands; + llvm::SmallVector types; llvm::SMLoc type_loc = parser.getCurrentLocation(); if (parser.parseOptionalRParen()) { // Parse non-empty operands do { // Parse %c : i32, - OpAsmParser::OperandType operand; - Type type; + mlir::OpAsmParser::OperandType operand; + mlir::Type type; if (parser.parseOperand(operand) || parser.parseColonType(type)) - return failure(); + return mlir::failure(); operands.push_back(operand); types.push_back(type); } while (succeeded(parser.parseOptionalComma())); - if (parser.parseRParen()) return failure(); + if (parser.parseRParen()) return mlir::failure(); } if (parser.resolveOperands(operands, types, type_loc, result.operands)) - return failure(); + return mlir::failure(); // Parse the keyword attribute, e.g. 
max_count = 100, duration_secs = 1 do { - StringRef attr; - Attribute resultAttr; + mlir::StringRef attr; + mlir::Attribute resultAttr; if (parser.parseKeyword(&attr) || parser.parseEqual() || parser.parseAttribute(resultAttr, parser.getBuilder().getIntegerType(32), attr, result.attributes)) - return failure(); - } while (succeeded(parser.parseOptionalComma())); + return mlir::failure(); + } while (mlir::succeeded(parser.parseOptionalComma())); // Set the default attribute num_warmup_runs to 1 if unset auto setDefaultAttrIfUnset = [&](const char *attr_name, int value) { bool found = llvm::any_of(result.attributes, - [attr_name](const NamedAttribute &attr) { - return attr.first == attr_name; + [attr_name](const mlir::NamedAttribute &attr) { + return attr.getName() == attr_name; }); if (!found) { - IntegerAttr default_val = parser.getBuilder().getI32IntegerAttr(value); + mlir::IntegerAttr default_val = + parser.getBuilder().getI32IntegerAttr(value); result.addAttribute(attr_name, default_val); } }; setDefaultAttrIfUnset("num_warmup_runs", 1); - Region *target = result.addRegion(); + mlir::Region *target = result.addRegion(); return parser.parseRegion(*target, operands, types, @@ -102,11 +103,11 @@ static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT // max_count = 100, duration_secs = 1 { // ... // } -static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT +static void print(mlir::OpAsmPrinter &p, BenchmarkOp op) { // NOLINT p << "infrt.benchmark "; // Print the name attribute, e.g "add.i32" - auto name_attr = op.getAttr("name"); + auto name_attr = op->getAttr("name"); p << name_attr; // Print the operands and types, e.g. (%c : i32, %d : f32) @@ -120,13 +121,13 @@ static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT bool need_comma = false; // Print the attributes, e.g. max_count = 100, duration_secs = 1 - for (auto &name_attr : op.getAttrs()) { - auto id = name_attr.first; + for (auto &name_attr : op->getAttrs()) { + auto id = name_attr.getName(); if (id == "name") continue; if (need_comma) p << ", "; - auto attr = name_attr.second; + auto attr = name_attr.getValue(); p << id << " = "; - if (auto int_attr = attr.dyn_cast()) { + if (auto int_attr = attr.dyn_cast()) { int_attr.getValue().print(p.getStream(), /*isSigned=*/false); } else { op.emitOpError("Unexpected attribute"); @@ -142,7 +143,7 @@ static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT p.printRegion(op.region(), /*printEntryBlockArgs=*/false); } -static LogicalResult verify(BenchmarkOp op) { +static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. auto ®ion = op.region(); auto &last_op = region.front().back(); @@ -154,10 +155,10 @@ static LogicalResult verify(BenchmarkOp op) { "incorrect number of return values. One return value is expected"); } - return success(); + return mlir::success(); } +} // namespace dialect +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/test_kernels.cpp.inc" - -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/test_kernels.h index 29d4209cb7280..73c8a6fb387bc 100644 --- a/paddle/infrt/dialect/test_kernels.h +++ b/paddle/infrt/dialect/test_kernels.h @@ -13,11 +13,8 @@ // limitations under the License. 
#pragma once -#include "mlir/IR/OpDefinition.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include -namespace infrt::dialect { -using namespace mlir; // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/test_kernels.hpp.inc" -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/types.cc b/paddle/infrt/dialect/types.cc deleted file mode 100644 index 6d6f6a20b46c9..0000000000000 --- a/paddle/infrt/dialect/types.cc +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/types.h" - -namespace infrt::hlir::mlir {} // namespace infrt::hlir::mlir diff --git a/paddle/infrt/dialect/types.h b/paddle/infrt/dialect/types.h deleted file mode 100644 index a9a2b61871cc0..0000000000000 --- a/paddle/infrt/dialect/types.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc index cdb8cc99ecb26..e3917bd07d242 100644 --- a/paddle/infrt/host_context/core_runtime.cc +++ b/paddle/infrt/host_context/core_runtime.cc @@ -23,7 +23,8 @@ #include "paddle/infrt/host_context/op_executable.h" #include "paddle/infrt/host_context/symbol_table.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct CoreRuntime::Impl { KernelRegistry* kernel_registry{}; @@ -90,4 +91,5 @@ llvm::SmallVector CoreRuntime::GetResults( CoreRuntime::~CoreRuntime() {} -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h index 802f8b17bb010..acb6a66cac630 100644 --- a/paddle/infrt/host_context/core_runtime.h +++ b/paddle/infrt/host_context/core_runtime.h @@ -22,7 +22,8 @@ #include "paddle/infrt/host_context/value.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { class KernelRegistry; class OpExecutable; @@ -83,4 +84,5 @@ class CoreRuntimeBuilder : public CoreRuntime { OpExecutableBuilder* NewOpExecutable(const std::string& op_name); }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h index 20cb17dc7fbe2..5186b88fe2c41 100644 --- a/paddle/infrt/host_context/kernel_frame.h +++ b/paddle/infrt/host_context/kernel_frame.h @@ -21,7 +21,8 @@ #include "llvm/ADT/SmallVector.h" #include "paddle/infrt/host_context/value.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { /** * KernelFrame captures the states(input arguments, attributes, results) @@ -163,4 +164,5 @@ class KernelFrameBuilder : public KernelFrame { } }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_registry_test.cc b/paddle/infrt/host_context/kernel_registry_test.cc index f36ec2a1cac7d..7fca56343041c 100644 --- a/paddle/infrt/host_context/kernel_registry_test.cc +++ b/paddle/infrt/host_context/kernel_registry_test.cc @@ -18,7 +18,8 @@ #include "paddle/infrt/host_context/kernel_utils.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { int add_i32(int a, int b) { return a + b; } @@ -44,4 +45,5 @@ TEST(KernelRegistry, basic) { ASSERT_EQ(results[0]->get(), 3); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc index 1904eb106a293..bebd8d86e50bb 100644 --- a/paddle/infrt/host_context/kernel_utils_test.cc +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -16,7 +16,8 @@ #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { int add_i32(int a, int b) { return a + b; } float add_f32(float a, float b) { return a + b; } @@ -66,4 +67,5 @@ TEST(KernelImpl, pair) { ASSERT_EQ(results[1]->get(), 3.f); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc index 5f8dacf8e448a..47ec27ebec300 100644 --- a/paddle/infrt/host_context/mlir_function_executable.cc +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -15,6 +15,7 @@ #include 
"paddle/infrt/host_context/mlir_function_executable.h" #include +#include #include // NOLINT diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h index ba5fa154d6fcc..a6428df86e6b2 100644 --- a/paddle/infrt/host_context/mlir_function_executable.h +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -13,7 +13,8 @@ // limitations under the License. #pragma once -#include +#include +#include #include #include diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h index b2af4d2d79db5..c2ccb90640b21 100644 --- a/paddle/infrt/host_context/mlir_program_executor.h +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -15,9 +15,9 @@ #pragma once #include +#include +#include #include -#include -#include #include #include diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 25324b1291582..3dbc7a702be38 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,8 +16,9 @@ #include #include +#include +#include #include -#include #include #include @@ -40,7 +41,8 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { template std::string DumpToString(T& op) { // NOLINT @@ -113,10 +115,10 @@ bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isInteger(32)) { return val.getInt(); } @@ -125,10 +127,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( } template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isInteger(64)) { return val.getInt(); } @@ -139,10 +141,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( // TODO(Superjomn) Make double and float parsing share some thing. 
template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isF32()) return val.getValueAsDouble(); } return boost::none; @@ -150,10 +152,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isF64()) return val.getValueAsDouble(); } return boost::none; @@ -161,17 +163,17 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - return attr->cast().getValue().str(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + return attr.cast().getValue().str(); } #define PROCESS_ARRAY_INT(type__, bits__) \ template <> \ boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ - const mlir::Attribute* attr) { \ - if (!attr->isa()) return boost::none; \ - auto array = attr->cast(); \ + const mlir::Attribute& attr) { \ + if (!attr.isa()) return boost::none; \ + auto array = attr.cast(); \ CHECK(!array.empty()); \ \ if (!array[0].getType().isInteger(bits__)) { \ @@ -191,9 +193,9 @@ PROCESS_ARRAY_INT(int64_t, 64); template <> boost::optional> MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - auto array = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + auto array = attr.cast(); CHECK(!array.empty()); if (!array[0].getType().isF32()) return boost::none; @@ -207,9 +209,9 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional> MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - auto array = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + auto array = attr.cast(); CHECK(!array.empty()); if (!array[0].getType().isF64()) return boost::none; @@ -236,7 +238,8 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { for (int i = 0, e = op->getNumOperands(); i < e; i++) { // function argument as value auto operand = op->getOperand(i); - if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { mlir::BlockArgument arg = operand.dyn_cast(); Value* arg_value = GetValue(arg); impl_->cur_op->AppendArgument(arg_value); @@ -283,25 +286,25 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { for (size_t i = 0; i < attrs.size(); i++) { auto& attr = attrs[i]; - if (auto v = EmitAttribute(&attr.second)) { + if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else 
if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); } else { LOG(FATAL) << "Not supported attribute type"; @@ -330,7 +333,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { llvm::SmallVector results; auto func_type = - mlir::FunctionType::get(inputs, results, region.getContext()); + mlir::FunctionType::get(region.getContext(), inputs, results); auto* function = impl_->cur_op->CreateFunctionExecutable( ®ion, func_type, &impl_->func_defs); impl_->cur_op->AppendAttribute(new Value(function)); @@ -555,4 +558,5 @@ void TestMlir(mlir::ModuleOp module, KernelRegistry* registry) { execute.Run(); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 598e81bfd96d8..fcd79eaf386ee 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -29,7 +29,8 @@ class Attribute; class Value; } // namespace mlir -namespace infrt::host_context { +namespace infrt { +namespace host_context { class CoreRuntimeBuilder; class Value; @@ -73,7 +74,7 @@ class MlirToRuntimeTranslator { bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); template - boost::optional EmitAttribute(const mlir::Attribute* attr); + boost::optional EmitAttribute(const mlir::Attribute& attr); Value* GetOpResult(mlir::Operation* op); @@ -104,4 +105,5 @@ void MlirToRuntimeTranslate(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); */ void TestMlir(mlir::ModuleOp module, KernelRegistry* registry); -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 9b85be977ab6c..375daa4515e17 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -29,7 +29,8 @@ #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { TEST(MlirToRuntimeTranslate, basic) { mlir::MLIRContext context; @@ -48,7 +49,7 @@ func @main() -> () { )ROC"; auto module = dialect::LoadMlirSource(&context, source); - module->verify(); + 
EXPECT_TRUE(mlir::succeeded(module->verify())); KernelRegistry registry; kernel::RegisterFloatBasicKernels(®istry); @@ -74,7 +75,7 @@ func @main() -> () { )ROC"; auto module = dialect::LoadMlirSource(&context, source); - module->verify(); + EXPECT_TRUE(mlir::succeeded(module->verify())); KernelRegistry registry; kernel::RegisterFloatBasicKernels(®istry); @@ -115,7 +116,7 @@ infrt.return %a0, %b0: !infrt.tensor, !infrt.tensorverify(); + EXPECT_TRUE(mlir::succeeded(module->verify())); host_context::KernelRegistry registry; @@ -157,4 +158,5 @@ infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor #include #include "paddle/infrt/host_context/kernel_frame.h" @@ -21,7 +22,8 @@ #include "paddle/infrt/host_context/mlir_function_executable.h" #include "paddle/infrt/host_context/symbol_table.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct OpExecutable::Impl { Impl(const std::string& op_name, @@ -148,4 +150,5 @@ void OpExecutable::Execute() { OpExecutable::~OpExecutable() {} -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h index e2248225a5caf..550f6ab6349ed 100644 --- a/paddle/infrt/host_context/op_executable.h +++ b/paddle/infrt/host_context/op_executable.h @@ -14,19 +14,18 @@ #pragma once #include - +#include +#include #include #include #include -#include "mlir/IR/Function.h" -#include "mlir/IR/Region.h" - namespace mlir { class FuncOp; } // namespace mlir -namespace infrt::host_context { +namespace infrt { +namespace host_context { class SymbolTable; class KernelRegistry; @@ -89,4 +88,5 @@ class OpExecutableBuilder : public OpExecutable { function_defs_t* function_defs); }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index d7f2c3865157d..b186cfcfd2b35 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -23,7 +23,8 @@ using infrt::host_context::Attribute; -namespace infrt::kernel { +namespace infrt { +namespace kernel { template T add(T a, T b) { @@ -82,4 +83,5 @@ void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/basic_kernels.h b/paddle/infrt/kernel/basic_kernels.h index 9e98885cf6ebf..feb66be61f530 100644 --- a/paddle/infrt/kernel/basic_kernels.h +++ b/paddle/infrt/kernel/basic_kernels.h @@ -15,13 +15,16 @@ #pragma once #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { /** * Register all the basic kernels to \p registry. 
@@ -31,4 +34,5 @@ void RegisterBasicKernels(host_context::KernelRegistry* registry); void RegisterIntBasicKernels(host_context::KernelRegistry* registry); void RegisterFloatBasicKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 2fa477aa4dbda..51e0004922374 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,7 +25,8 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::kernel { +namespace infrt { +namespace kernel { using namespace host_context; // NOLINT using namespace tensor; // NOLINT @@ -76,4 +77,5 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(ShallowCopyTensor)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.h b/paddle/infrt/kernel/tensor_kernels.h index 8f2180ba80a4f..df8e25c32393c 100644 --- a/paddle/infrt/kernel/tensor_kernels.h +++ b/paddle/infrt/kernel/tensor_kernels.h @@ -14,12 +14,16 @@ #pragma once -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { void RegisterTensorKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_shape_kernels.cc b/paddle/infrt/kernel/tensor_shape_kernels.cc index a04b492819298..4edbecfa10886 100644 --- a/paddle/infrt/kernel/tensor_shape_kernels.cc +++ b/paddle/infrt/kernel/tensor_shape_kernels.cc @@ -24,7 +24,8 @@ #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::kernel { +namespace infrt { +namespace kernel { void PrintShape(const tensor::TensorShape& shape) { llvm::raw_os_ostream oos(std::cout); @@ -35,4 +36,5 @@ void RegisterTensorShapeKernels(host_context::KernelRegistry* registry) { registry->AddKernel("ts.print_shape", INFRT_KERNEL(PrintShape)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_shape_kernels.h b/paddle/infrt/kernel/tensor_shape_kernels.h index e87c6c37e88a0..e31a37463be43 100644 --- a/paddle/infrt/kernel/tensor_shape_kernels.h +++ b/paddle/infrt/kernel/tensor_shape_kernels.h @@ -14,14 +14,18 @@ #pragma once -namespace infrt::host_context { +namespace infrt { +namespace host_context { class KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { void RegisterTensorShapeKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index d5f64d09b602f..ccfb3356a855f 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -33,7 +33,8 @@ using infrt::host_context::Attribute; using infrt::host_context::MlirFunctionExecutable; using infrt::host_context::RemainingArguments; -namespace infrt::kernel { +namespace infrt { +namespace kernel { namespace { class BenchmarkStats { public: @@ -197,4 +198,5 @@ void 
RegisterTestKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(ShadowCopyTensor)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.h b/paddle/infrt/kernel/test_kernels.h index f42884dfaf2c9..f5639ec1afaad 100644 --- a/paddle/infrt/kernel/test_kernels.h +++ b/paddle/infrt/kernel/test_kernels.h @@ -15,17 +15,21 @@ #pragma once #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { /** * Register all the test kernels to registry. */ void RegisterTestKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/paddle/cpp/desc_api.h b/paddle/infrt/paddle/cpp/desc_api.h index ccd79c048ab14..3b2dcb0018b2f 100644 --- a/paddle/infrt/paddle/cpp/desc_api.h +++ b/paddle/infrt/paddle/cpp/desc_api.h @@ -18,7 +18,9 @@ #include #include -namespace infrt::paddle::cpp { +namespace infrt { +namespace paddle { +namespace cpp { /* * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc @@ -226,4 +228,6 @@ class ProgramDescAPI { virtual void SetVersion(int64_t version) = 0; }; -} // namespace infrt::paddle::cpp +} // namespace cpp +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc index 285280e69435b..f3de1a630451c 100644 --- a/paddle/infrt/paddle/model_parser.cc +++ b/paddle/infrt/paddle/model_parser.cc @@ -22,7 +22,8 @@ #include "paddle/infrt/common/target.h" #include "paddle/infrt/common/type.h" -namespace infrt::paddle { +namespace infrt { +namespace paddle { int SizeOfType(framework_proto::VarType::Type type) { using Type = framework_proto::VarType::Type; @@ -169,4 +170,5 @@ void LoadParam(const std::string &path, _Variable *out, const Target &target) { LoadLoDTensor(fin, out, target); } -} // namespace infrt::paddle +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h index 73125fadedb82..373f77033dcef 100644 --- a/paddle/infrt/paddle/model_parser.h +++ b/paddle/infrt/paddle/model_parser.h @@ -25,7 +25,8 @@ #include "paddle/infrt/paddle/scope.h" #include "paddle/infrt/paddle/tensor.h" -namespace infrt::paddle { +namespace infrt { +namespace paddle { namespace framework_proto = ::paddle::framework::proto; // Read a __model__ file. 
@@ -52,4 +53,5 @@ void TensorFromStream( const common::Target& target = common::DefaultHostTarget()); void ReadBinaryFile(const std::string& filename, std::string* contents); -} // namespace infrt::paddle +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/block_desc.cc b/paddle/infrt/paddle/pb/block_desc.cc index 11186bc68af16..5b28fa5464c54 100644 --- a/paddle/infrt/paddle/pb/block_desc.cc +++ b/paddle/infrt/paddle/pb/block_desc.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/paddle/pb/block_desc.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { template <> framework_proto::VarDesc* BlockDesc::GetVar( @@ -40,4 +42,6 @@ framework_proto::OpDesc* BlockDesc::AddOp() { return desc_->add_ops(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/block_desc.h b/paddle/infrt/paddle/pb/block_desc.h index 9c1b7f9adf172..c9e325699a4bc 100644 --- a/paddle/infrt/paddle/pb/block_desc.h +++ b/paddle/infrt/paddle/pb/block_desc.h @@ -18,7 +18,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; @@ -74,4 +76,6 @@ class BlockDesc : public cpp::BlockDescAPI { framework_proto::BlockDesc* desc_; // not_own }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/op_desc.cc b/paddle/infrt/paddle/pb/op_desc.cc index c7b1e66f50642..32dcefb1ac684 100644 --- a/paddle/infrt/paddle/pb/op_desc.cc +++ b/paddle/infrt/paddle/pb/op_desc.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/paddle/pb/op_desc.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { google::protobuf::internal::RepeatedPtrIterator FindAttr(framework_proto::OpDesc *desc, const std::string &name) { @@ -136,4 +138,6 @@ GET_ATTRS_IMPL(std::vector, strings); GET_ATTR_IMPL(std::string, s); GET_ATTRS_IMPL(std::vector, longs); -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/op_desc.h b/paddle/infrt/paddle/pb/op_desc.h index 81d57d9f32252..2829f2aca2e08 100644 --- a/paddle/infrt/paddle/pb/op_desc.h +++ b/paddle/infrt/paddle/pb/op_desc.h @@ -19,7 +19,9 @@ #include "paddle/infrt/paddle/framework.pb.h" #include "paddle/infrt/support/variant.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; @@ -195,4 +197,6 @@ template <> void OpDesc::SetAttr>(const std::string &name, const std::vector &v); -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/program_desc.cc b/paddle/infrt/paddle/pb/program_desc.cc index ed8a7e36e0129..9d725485a974d 100644 --- a/paddle/infrt/paddle/pb/program_desc.cc +++ b/paddle/infrt/paddle/pb/program_desc.cc @@ -17,7 +17,9 @@ #include #include -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { template <> framework_proto::BlockDesc* ProgramDesc::GetBlock( @@ -32,4 +34,6 @@ ProgramDesc::AddBlock() { return desc_->add_blocks(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/program_desc.h b/paddle/infrt/paddle/pb/program_desc.h 
index 4adad650c974d..b1e64f8e86611 100644 --- a/paddle/infrt/paddle/pb/program_desc.h +++ b/paddle/infrt/paddle/pb/program_desc.h @@ -21,7 +21,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; class ProgramDesc : public cpp::ProgramDescAPI { @@ -58,4 +60,6 @@ class ProgramDesc : public cpp::ProgramDescAPI { framework_proto::ProgramDesc *desc_; // not_own }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/var_desc.cc b/paddle/infrt/paddle/pb/var_desc.cc index cf80df4f1b845..7ea2e24da3446 100644 --- a/paddle/infrt/paddle/pb/var_desc.cc +++ b/paddle/infrt/paddle/pb/var_desc.cc @@ -19,7 +19,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { cpp::VarDescAPI::Type VarDesc::GetType() const { auto type = desc_->type().type(); @@ -364,4 +366,6 @@ VarDesc::mutable_tensor_descs() { return std::vector(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/var_desc.h b/paddle/infrt/paddle/pb/var_desc.h index 4cff5fdee0375..7215ba6bb6aa7 100644 --- a/paddle/infrt/paddle/pb/var_desc.h +++ b/paddle/infrt/paddle/pb/var_desc.h @@ -23,7 +23,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; // convert between std::vector and protobuf repeated. @@ -121,4 +123,6 @@ class VarDesc : public cpp::VarDescAPI { framework_proto::VarDesc *desc_; }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt From 87ee3e4f5438c567796e128b73eb7703aa56d2ec Mon Sep 17 00:00:00 2001 From: Zhangjingyu06 <92561254+Zhangjingyu06@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:15:47 +0800 Subject: [PATCH 22/24] [XPU]add stack_grad op for kunlun2,*test=kunlun (#38674) * [XPU]add split op for kunlun2,*test=kunlun * [XPU]add split op for kunlun2,*test=kunlun * [XPU]add split op for kunlun,*test=kunlun * [XPU]add stack_grad op for kunlun2,*test=kunlun Co-authored-by: QingshuChen --- paddle/fluid/operators/stack_op_xpu.cc | 43 ++++++++++++++++--- .../fluid/platform/device/xpu/xpu1_op_list.h | 1 + .../fluid/platform/device/xpu/xpu2_op_list.h | 2 + .../tests/unittests/xpu/test_stack_op_xpu.py | 19 +++++++- 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index 01ec4a2b16b4a..a2590e1180c1a 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/stack_op.h" #include -#ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { @@ -59,14 +62,44 @@ class StackXPUKernel : public framework::OpKernel { } }; +template +class StackGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + auto axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + auto dy_dims = dy->dims(); + + if (axis < 0) axis += dy_dims.size() + 1; + auto dy_shape = framework::vectorize(dy_dims); + + std::vector dx_dims_list(dx.size(), 1); + std::vector dx_lists; + for (auto out : dx) { + dx_lists.push_back(out->mutable_data(ctx.GetPlace())); + } + + int r = xpu::split(dev_ctx.x_context(), dy->data(), dx_lists, + dy_shape, dx_dims_list, axis); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "The stack_grad XPU kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + } // namespace operators } // namespace paddle namespace plat = paddle::platform; namespace ops = paddle::operators; - REGISTER_OP_XPU_KERNEL(stack, - ops::StackXPUKernel, + ops::StackXPUKernel, ops::StackXPUKernel, - ops::StackXPUKernel); + ops::StackXPUKernel); +REGISTER_OP_XPU_KERNEL(stack_grad, + ops::StackGradXPUKernel, + ops::StackGradXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index 26a1426bea036..a76bdd4ae9679 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -300,6 +300,7 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 79261a5d7bc88..3d140b4693a6f 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -333,6 +333,8 @@ XPUOpMap& get_kl2_ops() { {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, + {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py index 68e5a6ccdbfb7..20446aee41ec7 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -66,6 +66,15 @@ def test_check_output(self): place = paddle.XPUPlace(0) self.check_output_with_place(place) + def test_check_grad(self): + if self.dtype == 'int64' or self.dtype == 'int32': + pass + else: + if paddle.is_compiled_with_xpu(): + paddle.enable_static() + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, self.get_x_names(), 'Y') + class TestStackOp1(TestStackOpBase): def initParameters(self): @@ -81,11 +90,17 @@ class TestStackOp3(TestStackOpBase): def initParameters(self): self.axis = -1 + def test_check_grad(self): + pass + class TestStackOp4(TestStackOpBase): def initParameters(self): self.axis = -4 + def test_check_grad(self): + pass + class TestStackOp5(TestStackOpBase): def initParameters(self): @@ -113,7 +128,7 @@ def initDefaultParameters(self): self.num_inputs = 4 self.input_dim = (5, 6, 7) self.axis = 0 - self.dtype = 'int' + self.dtype = 'int32' def initParameters(self): self.num_inputs = 16 From 050aa6fe5a524b0e7b85201c54a0da315701518d Mon Sep 17 00:00:00 2001 From: heliqi Date: Fri, 14 Jan 2022 16:50:56 +0800 Subject: [PATCH 23/24] add flatten_contiguous_range OpConvert for Paddle-TRT (#38922) * add trt_convert_flatten_contiguous_rang op * trt version >7,support trt_convert_flatten_contiguous_rang * trt version >7,support trt_convert_flatten_contiguous_rang * trt version >7,support trt_convert_flatten_contiguous_rang * test cast add trt version >=7 skip --- .../ir_passes/tensorrt_subgraph_pass.cc | 7 +- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../convert/flatten_contiguous_range_op.cc | 136 ++++++++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 32 +++++ ...st_trt_convert_flatten_contiguous_range.py | 115 +++++++++++++++ 6 files changed, 290 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index ef50df3084f8c..55bbc55450876 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -46,8 +46,11 @@ void analysis::TensorRtSubgraphPass::ApplyImpl( << " is diabled by config in TensorRT"; return false; } - return tensorrt::OpTeller::Global().Tell(node, no_calib_int8, - with_dynamic_shape); + bool is_ok = tensorrt::OpTeller::Global().Tell(node, no_calib_int8, + with_dynamic_shape); + if (!is_ok) + VLOG(3) << node->Op()->Type().c_str() << " op is not in TensorRT"; + return is_ok; }; framework::ir::SubGraphFuser fuser( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2799fb9e174d3..d4b680288e347 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1416,6 +1416,7 @@ USE_TRT_CONVERTER(elementwise_min_tensor); USE_TRT_CONVERTER(elementwise_pow_tensor); USE_TRT_CONVERTER(transpose); USE_TRT_CONVERTER(flatten); +USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); diff --git 
a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index a885b69fa7fbc..017caca6adc81 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -3,7 +3,7 @@ nv_library(tensorrt_converter SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc + shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc anchor_generator_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc new file mode 100644 index 0000000000000..706814340a0e9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { +/* + * flatten_contiguous_range trt converter + */ +class FlattenContiguousRangeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + int dims = input->getDimensions().nbDims; + int start_axis = BOOST_GET_CONST(int, op_desc.GetAttr("start_axis")); + int stop_axis = BOOST_GET_CONST(int, op_desc.GetAttr("stop_axis")); + + nvinfer1::IShuffleLayer* layer = nullptr; + if (!engine_->with_dynamic_shape()) { + if (start_axis < 0) start_axis += dims + 1; + if (stop_axis < 0) stop_axis += dims + 1; + int dim_prod = 1; + nvinfer1::Dims flatten_dim; + flatten_dim.nbDims = dims - (stop_axis - start_axis); + for (int i = 0, j = 0; i < dims; ++i) { + if (start_axis <= i + 1 && i + 1 <= stop_axis) { + int dim_i = input->getDimensions().d[i]; + PADDLE_ENFORCE_GT(dim_i, 0, platform::errors::InvalidArgument( + "flatten_contiguous_range input dim " + "should be > 0, but got %d.", + dim_i)); + dim_prod *= dim_i; + if (i + 1 == stop_axis) { + flatten_dim.d[j++] = dim_prod; + } + } else { + flatten_dim.d[j++] = input->getDimensions().d[i]; + } + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setReshapeDimensions(flatten_dim); + } else { + if (start_axis < 0) start_axis += dims; + if (stop_axis < 0) stop_axis += dims; + auto* shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + auto* shape_layer_itensor = shape_layer->getOutput(0); + + nvinfer1::Dims start_dim, size_dim, stride_dim; + start_dim.nbDims = 1; + size_dim.nbDims = 1; + stride_dim.nbDims = 1; + start_dim.d[0] = start_axis; + size_dim.d[0] = stop_axis - start_axis + 1; + stride_dim.d[0] = 1; + auto* slice_layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *shape_layer_itensor, start_dim, + size_dim, stride_dim); + uint32_t reduce_dim = 1; + auto* reduce_prod_layer = TRT_ENGINE_ADD_LAYER( + engine_, Reduce, *(slice_layer->getOutput(0)), + nvinfer1::ReduceOperation::kPROD, reduce_dim, true); + + nvinfer1::ITensor* input_shape = nullptr; + if (start_axis == 0 && stop_axis == dims - 1) { + input_shape = reduce_prod_layer->getOutput(0); + } else { + std::vector itensors; + if (start_axis > 0) { + nvinfer1::Dims left_start_dim, left_size_dim, left_stride_dim; + left_start_dim.nbDims = 1; + left_size_dim.nbDims = 1; + left_stride_dim.nbDims = 1; + left_start_dim.d[0] = 0; + left_size_dim.d[0] = start_axis; + left_stride_dim.d[0] = 1; + auto* slice_layer_left = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shape_layer_itensor, left_start_dim, + left_size_dim, left_stride_dim); + itensors.push_back(slice_layer_left->getOutput(0)); + } + itensors.push_back(reduce_prod_layer->getOutput(0)); + if (stop_axis < dims - 1) { + nvinfer1::Dims right_start_dim, right_size_dim, right_stride_dim; + right_start_dim.nbDims = 1; + right_size_dim.nbDims = 1; + right_stride_dim.nbDims = 1; + right_start_dim.d[0] = stop_axis + 1; + right_size_dim.d[0] = dims - stop_axis - 1; + right_stride_dim.d[0] = 1; + auto* slice_layer_right = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shape_layer_itensor, right_start_dim, + right_size_dim, 
right_stride_dim); + itensors.push_back(slice_layer_right->getOutput(0)); + } + auto* concat_layer = TRT_ENGINE_ADD_LAYER( + engine_, Concatenation, itensors.data(), itensors.size()); + concat_layer->setAxis(0); + input_shape = concat_layer->getOutput(0); + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setInput(1, *input_shape); + } + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "flatten_contiguous_range", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(flatten_contiguous_range, + FlattenContiguousRangeOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index ddee4e0d682b0..6663103d4ca37 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -55,6 +55,7 @@ struct SimpleOpTypeSetTeller : public Teller { // #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); + teller_set.insert("flatten_contiguous_range"); #endif #if CUDA_VERSION >= 10020 teller_set.insert("reshape"); @@ -531,6 +532,37 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis != 1) return false; } } + if (op_type == "flatten_contiguous_range") { + if (!with_dynamic_shape) { + int start_axis = BOOST_GET_CONST(int, desc.GetAttr("start_axis")); + int stop_axis = BOOST_GET_CONST(int, desc.GetAttr("stop_axis")); + auto x_var_name = desc.Input("X")[0]; + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + int dims = x_shape.size(); + if (start_axis < 0) start_axis += dims; + if (start_axis == 0) { + VLOG(3) << "TRT flatten_contiguous_range not support the " + "batch-dimension being changed"; + return false; + } + if (stop_axis < 0) stop_axis += dims; + for (int i = start_axis; i <= stop_axis; ++i) { + if (x_shape[i] < 0) { + VLOG(3) << "On TRT static shape,flatten_contiguous_range input dim " + "should be > 0"; + return false; + } + } + } + } if (op_type == "gather") { auto gather_inputs = desc.Inputs(); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py new file mode 100644 index 0000000000000..a4060349d4bed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py @@ -0,0 +1,115 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertFlattenContiguousRangeTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(batch): + return np.random.random([2, batch, 4, 8, 3]).astype(np.float32) + + for batch in [1, 2, 4]: + for start_axis in range(5): + for stop_axis in range(start_axis, 5): + type = "flatten_contiguous_range" + op_outputs = { + "Out": ["output_data"], + "XShape": ["xshape_data"] + } + ops_config = [{ + "op_type": type, + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": op_outputs, + "op_attrs": { + "start_axis": start_axis, + "stop_axis": stop_axis, + } + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, batch)) + }, + outputs=["output_data"]) + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [2, 1, 4, 8, 3]} + self.dynamic_shape.max_input_shape = {"input_data": [2, 4, 4, 8, 3]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 2, 4, 8, 3]} + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[0] * 10 >= 7000: + if dynamic_shape: + return 1, 2 + else: + if attrs[0]['start_axis'] == 0: + return 0, 3 + else: + return 1, 2 + else: + return 0, 3 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From a88791481484ab6a61540a737336d79c65d021dc Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sat, 15 Jan 2022 12:39:49 +0800 Subject: [PATCH 24/24] fix performance problem caused by Conj (#38939) --- paddle/pten/kernels/complex_kernel.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index b6074f117ea14..d12fc730fef87 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -14,6 +14,7 @@ limitations under the License. 
 */

 #pragma once
+#include "paddle/fluid/platform/complex.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
@@ -23,7 +24,13 @@ namespace pten {
 template <typename T, typename Context>
 void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
-template <typename T, typename Context>
+// If T is complex
+template <
+    typename T,
+    typename Context,
+    std::enable_if_t<std::is_same<T, paddle::platform::complex<float>>::value ||
+                         std::is_same<T, paddle::platform::complex<double>>::value,
+                     bool> = true>
 DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   auto out_meta = UnchangedInferMeta(x.meta());
   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
@@ -31,4 +38,15 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   return dense_out;
 }
 
+// If T is not complex
+template <
+    typename T,
+    typename Context,
+    std::enable_if_t<!std::is_same<T, paddle::platform::complex<float>>::value &&
+                         !std::is_same<T, paddle::platform::complex<double>>::value,
+                     bool> = true>
+DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
+  return x;
+}
+
 } // namespace pten
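
Note on the overload pair in the last hunk: Conj is split into a complex and a non-complex version with std::enable_if_t on a defaulted bool template parameter, so for real-valued element types the call collapses to handing back the input tensor and no output allocation or ConjKernel launch happens. The standalone sketch below illustrates the same dispatch pattern; the Tensor struct, the is_complex_v helper, the file name, and the reference-returning identity overload are simplifications invented for this example and are not part of the pten API.

// sketch_conj_dispatch.cc -- illustrative only, not the pten implementation.
#include <complex>
#include <iostream>
#include <type_traits>
#include <vector>

// Toy stand-in for a tensor: just a flat buffer of T.
template <typename T>
struct Tensor {
  std::vector<T> data;
};

// True only for the two complex element types considered here.
template <typename T>
constexpr bool is_complex_v = std::is_same<T, std::complex<float>>::value ||
                              std::is_same<T, std::complex<double>>::value;

// Complex case: allocate an output and conjugate every element.
template <typename T, std::enable_if_t<is_complex_v<T>, bool> = true>
Tensor<T> Conj(const Tensor<T>& x) {
  Tensor<T> out;
  out.data.reserve(x.data.size());
  for (const auto& v : x.data) out.data.push_back(std::conj(v));
  return out;
}

// Non-complex case: conjugation is the identity, so hand the input back
// without touching the data (the real patch returns the tensor itself for
// the same reason).
template <typename T, std::enable_if_t<!is_complex_v<T>, bool> = true>
const Tensor<T>& Conj(const Tensor<T>& x) {
  return x;
}

int main() {
  Tensor<float> real{{1.0f, 2.0f}};
  Tensor<std::complex<float>> cplx{{{1.0f, 2.0f}, {3.0f, -4.0f}}};

  const Tensor<float>& r = Conj(real);         // identity overload, no element-wise work
  Tensor<std::complex<float>> c = Conj(cplx);  // element-wise conjugate

  std::cout << r.data[1] << " " << c.data[1].imag() << "\n";  // prints: 2 4
  return 0;
}

Keeping both overloads in one header and letting SFINAE discard exactly one of them per element type avoids a runtime branch and keeps call sites identical for complex and non-complex tensors.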