diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 92bb2751c6433..0a9872771ff65 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,7 +26,7 @@ repos:
     -   id: sort-simple-yaml
         files: (api|backward|api_[a-z_]+)\.yaml$
     -   id: trailing-whitespace
-        files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
+        files: (.*\.(py|bzl|md|rst|c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps|cmake)|BUILD|.*\.BUILD|WORKSPACE|CMakeLists.txt)$
 -   repo: local
     hooks:
     -   id: clang-format
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
old mode 100644
new mode 100755
index cd7b254892ed1..a0fc013a130a1
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -25,8 +25,8 @@ set(GLOO_LIBRARY_DIR
     "${GLOO_INSTALL_DIR}/lib"
     CACHE PATH "gloo library directory." FORCE)
 # As we add extra features for gloo, we use the non-official repo
-set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
-set(GLOO_TAG v0.0.2)
+set(GLOO_REPOSITORY ${GIT_URL}/ziyoujiyi/gloo.git)
+set(GLOO_TAG v0.0.3)
 set(GLOO_LIBRARIES
     "${GLOO_INSTALL_DIR}/lib/libgloo.a"
     CACHE FILEPATH "gloo library." FORCE)
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index 10b1686ddb85f..3db2464e59afd 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -134,24 +134,56 @@ class ProcessGroup {
         "ProcessGroup%s does not support send", GetBackendName()));
   }
 
+  virtual std::shared_ptr<ProcessGroup::Task> Send(
+      std::vector<phi::DenseTensor>&, int, bool) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support send with sync_op flag",
+        GetBackendName()));
+  }
+
   virtual std::shared_ptr<ProcessGroup::Task> Recv(
-      std::vector<phi::DenseTensor>& tensors, int) {  // NOLINT
+      std::vector<phi::DenseTensor>&, int) {  // NOLINT
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "ProcessGroup%s does not support receive", GetBackendName()));
+        "ProcessGroup%s does not support recv", GetBackendName()));
   }
 
-  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor&,
-                                                           int,
-                                                           int,
-                                                           int) {  // NOLINT
+  virtual std::shared_ptr<ProcessGroup::Task> Recv(
+      std::vector<phi::DenseTensor>&, int, bool) {  // NOLINT
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "ProcessGroup%s does not support send", GetBackendName()));
+        "ProcessGroup%s does not support recv with sync_op flag",
+        GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
+      phi::DenseTensor&,  // NOLINT
+      int,
+      int,
+      int) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support send_partial", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
+      phi::DenseTensor&, int, int, int, bool) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support send_partial with sync_op flag",
+        GetBackendName()));
   }
 
   virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
-      phi::DenseTensor& tensors, int, int, int) {  // NOLINT
+      phi::DenseTensor&,  // NOLINT
+      int,
+      int,
+      int) {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "ProcessGroup%s does not support receive", GetBackendName()));
+        "ProcessGroup%s does not support recv_partial", GetBackendName()));
+  }
+
+  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
+      phi::DenseTensor&, int, int, int, bool) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support recv_partial with sync_op flag",
+        GetBackendName()));
   }
 
   virtual std::shared_ptr<ProcessGroup::Task> AllGather(
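Editor's note: the ProcessGroup.h hunk above follows one convention throughout — every new entry point gets a base-class virtual that throws, so a backend that never implements, say, send with a `sync_op` flag fails loudly at call time. The following is a minimal standalone sketch of that convention (all class names here are invented for illustration; this is not Paddle code):

```cpp
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>

class Group {
 public:
  virtual ~Group() = default;
  virtual std::string BackendName() const = 0;
  // Default implementation throws, mirroring the PADDLE_THROW stubs above.
  virtual void Send(int /*dst_rank*/, bool /*sync_op*/) {
    throw std::runtime_error("ProcessGroup" + BackendName() +
                             " does not support send with sync_op flag");
  }
};

class GlooLikeGroup : public Group {
 public:
  std::string BackendName() const override { return "GLOO"; }
  // Send is deliberately not overridden, so calling it throws.
};

int main() {
  std::unique_ptr<Group> g = std::make_unique<GlooLikeGroup>();
  try {
    g->Send(0, true);
  } catch (const std::exception& e) {
    std::cout << e.what() << "\n";  // prints the "does not support" diagnostic
  }
  return 0;
}
```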
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 239114ae6188c..368008d9cc0ce 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -51,6 +51,17 @@ std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
       places, rank, comm_type, inputs);
 }
 
+std::shared_ptr<ProcessGroupNCCL::NCCLTask> ProcessGroupNCCL::CreateTask(
+    const std::vector<Place>& places,
+    int rank,
+    CommType comm_type,
+    const std::vector<phi::DenseTensor>& inputs,
+    bool is_sync,
+    bool use_calc_stream) {
+  return std::make_shared<ProcessGroupNCCL::NCCLTask>(
+      places, rank, comm_type, inputs, is_sync, use_calc_stream);
+}
+
 ProcessGroupNCCL::NCCLTask::NCCLTask(
     const std::vector<Place>& places,
     int rank,
@@ -264,10 +275,12 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Collective(
   auto& nccl_comms = places_to_ncclcomm_[key];
 
-  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+  if (!use_calc_stream) {
+    SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+  }
 
-  auto task = std::make_shared<ProcessGroupNCCL::NCCLTask>(
-      places, rank_, comm_type, inputs, sync_op, use_calc_stream);
+  auto task =
+      CreateTask(places, rank_, comm_type, inputs, sync_op, use_calc_stream);
 
   platform::CUDADeviceGuard cuda_guard;
 
@@ -406,6 +419,78 @@ void ProcessGroupNCCL::Collective(const phi::DenseTensor* in,
   cuda_guard.SetDevice(places[0]);
 }
 
+template <typename Fn>
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
+    std::vector<phi::DenseTensor>& tensors,
+    Fn fn,
+    int dst_rank,
+    CommType op_type,
+    bool sync_op,
+    bool use_calc_stream) {
+  const auto& places = GetPlaceList(tensors);
+  const auto& key = GetKeyFromPlaces(places);
+
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) {
+      CreateNCCLManagerCache(key, places);
+    }
+  }
+
+  auto& nccl_comms = places_to_ncclcomm_[key];
+
+  if (!use_calc_stream) {
+    SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+  }
+
+  auto task =
+      CreateTask(places, rank_, op_type, tensors, sync_op, use_calc_stream);
+
+  platform::CUDADeviceGuard cuda_guard;
+
+  if (FLAGS_use_stream_safe_cuda_allocator) {
+    for (size_t i = 0; i < tensors.size(); ++i) {
+      cuda_guard.SetDevice(places[i]);
+      gpuStream_t nccl_stream;
+      if (use_calc_stream) {
+        nccl_stream =
+            static_cast<phi::GPUContext*>(
+                platform::DeviceContextPool::Instance().Get(places[i]))
+                ->stream();
+      } else {
+        nccl_stream = places_to_ctx_[key][i]->stream();
+      }
+      memory::RecordStream(tensors[i].Holder(), nccl_stream);
+    }
+  }
+
+  {
+    platform::NCCLGroupGuard nccl_guard;
+    for (size_t i = 0; i < tensors.size(); ++i) {
+      cuda_guard.SetDevice(places[i]);
+      gpuStream_t nccl_stream;
+      if (use_calc_stream) {
+        nccl_stream =
+            static_cast<phi::GPUContext*>(
+                platform::DeviceContextPool::Instance().Get(places[i]))
+                ->stream();
+      } else {
+        nccl_stream = places_to_ctx_[key][i]->stream();
+      }
+      fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank);
+    }
+  }
+
+  if (!use_calc_stream) {
+    for (size_t i = 0; i < tensors.size(); ++i) {
+      cuda_guard.SetDevice(places[i]);
+      task->control_events_[i].Record(*places_to_ctx_[key][i]);
+    }
+  }
+
+  return task;
+}
+
 template <typename Fn>
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::PointToPoint(
     std::vector<phi::DenseTensor>& tensors,
@@ -617,6 +702,34 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
   return task;
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send(
+    std::vector<phi::DenseTensor>& tensors,
+    int dst_rank,
+    bool sync_op,
+    bool use_calc_stream) {
+  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
+
+  auto task = PointToPoint(
+      tensors,
+      [&](phi::DenseTensor& input,
+          ncclComm_t comm,
+          const gpuStream_t& stream,
+          int dst_rank) {
+        return platform::dynload::ncclSend(
+            input.data(),
+            input.numel(),
+            platform::ToNCCLDataType(input.dtype()),
+            dst_rank,
+            comm,
+            stream);
+      },
+      dst_rank,
+      CommType::SEND,
+      sync_op,
+      use_calc_stream);
+  return task;
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
     std::vector<phi::DenseTensor>& tensors, int src_rank) {
   CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
@@ -640,6 +753,34 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
   return task;
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv(
+    std::vector<phi::DenseTensor>& tensors,
+    int src_rank,
+    bool sync_op,
+    bool use_calc_stream) {
+  CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
+
+  auto task = PointToPoint(
+      tensors,
+      [&](phi::DenseTensor& output,
+          ncclComm_t comm,
+          const gpuStream_t& stream,
+          int src_rank) {
+        return platform::dynload::ncclRecv(
+            output.data(),
+            output.numel(),
+            platform::ToNCCLDataType(output.dtype()),
+            src_rank,
+            comm,
+            stream);
+      },
+      src_rank,
+      CommType::RECV,
+      sync_op,
+      use_calc_stream);
+  return task;
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
     phi::DenseTensor& tensors, int dst_rank, int offset, int length) {
   // CheckTensorsInDifferentDevices(tensors, static_cast<size_t>(GetSize()));
@@ -647,10 +788,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
   phi::DenseTensor flatten_tensor;
   flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
 
-  phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length);
-
-  std::vector<phi::DenseTensor> shared_tensors;
-  shared_tensors.push_back(shared_input);
+  std::vector<phi::DenseTensor> shared_tensors{
+      flatten_tensor.Slice(offset, offset + length)};
 
   auto task = PointToPoint(
       shared_tensors,
@@ -671,16 +810,49 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
   return task;
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Send_Partial(
+    phi::DenseTensor& tensors,
+    int dst_rank,
+    int offset,
+    int length,
+    bool sync_op,
+    bool use_calc_stream) {
+  phi::DenseTensor flatten_tensor;
+  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
+
+  std::vector<phi::DenseTensor> shared_tensors{
+      flatten_tensor.Slice(offset, offset + length)};
+
+  auto task = PointToPoint(
+      shared_tensors,
+      [&](phi::DenseTensor& input,
+          ncclComm_t comm,
+          const gpuStream_t& stream,
+          int dst_rank) {
+        return platform::dynload::ncclSend(
+            input.data(),
+            input.numel(),
+            platform::ToNCCLDataType(input.dtype()),
+            dst_rank,
+            comm,
+            stream);
+      },
+      dst_rank,
+      CommType::SEND,
+      sync_op,
+      use_calc_stream);
+  return task;
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
     phi::DenseTensor& tensors, int src_rank, int offset, int length) {
   // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length);
 
   phi::DenseTensor flatten_tensor;
   flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
-  phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length);
 
-  std::vector<phi::DenseTensor> shared_tensors;
-  shared_tensors.push_back(shared_input);
+  std::vector<phi::DenseTensor> shared_tensors{
+      flatten_tensor.Slice(offset, offset + length)};
 
   auto task = PointToPoint(
       shared_tensors,
@@ -701,6 +873,40 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
   return task;
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Recv_Partial(
+    phi::DenseTensor& tensors,
+    int src_rank,
+    int offset,
+    int length,
+    bool sync_op,
+    bool use_calc_stream) {
+  phi::DenseTensor flatten_tensor;
+  flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()});
+
+  std::vector<phi::DenseTensor> shared_tensors{
+      flatten_tensor.Slice(offset, offset + length)};
+
+  auto task = PointToPoint(
+      shared_tensors,
+      [&](phi::DenseTensor& output,
+          ncclComm_t comm,
+          const gpuStream_t& stream,
+          int src_rank) {
+        return platform::dynload::ncclRecv(
+            output.data(),
+            output.numel(),
+            platform::ToNCCLDataType(output.dtype()),
+            src_rank,
+            comm,
+            stream);
+      },
+      src_rank,
+      CommType::RECV,
+      sync_op,
+      use_calc_stream);
+  return task;
+}
+
 std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllGather(
     std::vector<phi::DenseTensor>& in_tensors,
     std::vector<phi::DenseTensor>& out_tensors) {
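Editor's note: the `*_Partial` variants above flatten the tensor to one dimension and transfer only elements `[offset, offset + length)`. A tiny standalone sketch of that index arithmetic (invented helper name, plain `std::vector` standing in for `phi::DenseTensor`; not Paddle code):

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Hypothetical helper: the half-open element range a partial send/recv
// touches, given the flattened element count -- the same arithmetic as
// flatten_tensor.Slice(offset, offset + length) in the diff above.
std::pair<std::size_t, std::size_t> PartialRange(std::size_t numel,
                                                 std::size_t offset,
                                                 std::size_t length) {
  assert(offset + length <= numel && "slice must stay inside the tensor");
  return {offset, offset + length};
}

int main() {
  std::vector<float> tensor(12, 1.0f);  // stands in for a 3x4 dense tensor
  auto [begin, end] = PartialRange(tensor.size(), /*offset=*/4, /*length=*/4);
  // A real implementation would hand &tensor[begin] and (end - begin)
  // elements to ncclSend/ncclRecv.
  assert(end - begin == 4);
  return 0;
}
```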
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
index e0e298e9113e9..0b8fa54cd337e 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h
@@ -60,7 +60,7 @@ class ProcessGroupNCCL : public ProcessGroupStream {
         int rank,
         CommType comm_type,
         const std::vector<phi::DenseTensor>& inputs,
-        bool is_sync,
+        bool sync_op,
         bool use_calc_stream);
 
     bool IsCompleted();
@@ -122,19 +122,47 @@ class ProcessGroupNCCL : public ProcessGroupStream {
   std::shared_ptr<ProcessGroup::Task> Send(
       std::vector<phi::DenseTensor>& tensors, int dst_rank) override;
 
+  std::shared_ptr<ProcessGroup::Task> Send(
+      std::vector<phi::DenseTensor>& tensors,
+      int dst_rank,
+      bool sync_op,
+      bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> Recv(
       std::vector<phi::DenseTensor>& tensors, int src_rank) override;
 
+  std::shared_ptr<ProcessGroup::Task> Recv(
+      std::vector<phi::DenseTensor>& tensors,
+      int src_rank,
+      bool sync_op,
+      bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> Send_Partial(phi::DenseTensor& tensors,
                                                    int dst_rank,
                                                    int offset,
                                                    int length) override;
 
+  std::shared_ptr<ProcessGroup::Task> Send_Partial(
+      phi::DenseTensor& tensors,
+      int dst_rank,
+      int offset,
+      int length,
+      bool sync_op,
+      bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> Recv_Partial(phi::DenseTensor& tensors,
                                                    int src_rank,
                                                    int offset,
                                                    int length) override;
 
+  std::shared_ptr<ProcessGroup::Task> Recv_Partial(
+      phi::DenseTensor& tensors,
+      int src_rank,
+      int offset,
+      int length,
+      bool sync_op,
+      bool use_calc_stream) override;
+
   std::shared_ptr<ProcessGroup::Task> AllGather(
       std::vector<phi::DenseTensor>& in_tensors,
       std::vector<phi::DenseTensor>& out_tensors) override;
@@ -180,9 +208,17 @@ class ProcessGroupNCCL : public ProcessGroupStream {
   virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
       std::vector<Place> places,
       int rank,
-      CommType opType,
+      CommType op_type,
      const std::vector<phi::DenseTensor>& inputs);
 
+  virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
+      const std::vector<Place>& places,
+      int rank,
+      CommType op_type,
+      const std::vector<phi::DenseTensor>& inputs,
+      bool sync_op,
+      bool use_calc_stream);
+
  protected:
   std::shared_ptr<Store> store_;
   std::shared_ptr<NCCLCommManager> nccl_comm_;
@@ -233,6 +269,15 @@ class ProcessGroupNCCL : public ProcessGroupStream {
       int dst_rank,
       CommType op_type);
 
+  template <typename Fn>
+  std::shared_ptr<ProcessGroup::Task> PointToPoint(
+      std::vector<phi::DenseTensor>& tensors,  // NOLINT
+      Fn fn,
+      int dst_rank,
+      CommType op_type,
+      bool sync_op,
+      bool use_calc_stream);
+
   void CreateNCCLManagerCache(const std::string& places_key,
                               const std::vector<Place>& places);
diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc
index 9a20b8e6eaf79..51c8fe7bd9b1b 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc
@@ -45,5 +45,89 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::AllReduce(
       "ProcessGroup%s does not support do allreduce", GetBackendName()));
 }
 
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
+    std::vector<phi::DenseTensor>& tensors, int dst_rank, bool sync_op) {
+  return Send(tensors,
+              dst_rank,
+              sync_op,
+              /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send(
+    std::vector<phi::DenseTensor>& tensors,
+    int dst_rank,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "ProcessGroup%s does not support do send", GetBackendName()));
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send_Partial(
+    phi::DenseTensor& tensors,
+    int dst_rank,
+    int offset,
+    int length,
+    bool sync_op) {
+  return Send_Partial(tensors,
+                      dst_rank,
+                      offset,
+                      length,
+                      sync_op,
+                      /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Send_Partial(
+    phi::DenseTensor& tensors,
+    int dst_rank,
+    int offset,
+    int length,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "ProcessGroup%s does not support do send_partial", GetBackendName()));
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
+    std::vector<phi::DenseTensor>& tensors, int src_rank, bool sync_op) {
+  return Recv(tensors,
+              src_rank,
+              sync_op,
+              /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv(
+    std::vector<phi::DenseTensor>& tensors,
+    int src_rank,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "ProcessGroup%s does not support do recv", GetBackendName()));
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
+    phi::DenseTensor& tensors,
+    int src_rank,
+    int offset,
+    int length,
+    bool sync_op) {
+  return Recv_Partial(tensors,
+                      src_rank,
+                      offset,
+                      length,
+                      sync_op,
+                      /*use_calc_stream*/ false);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupStream::Recv_Partial(
+    phi::DenseTensor& tensors,
+    int src_rank,
+    int offset,
+    int length,
+    bool sync_op,
+    bool use_calc_stream) {
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "ProcessGroup%s does not support do recv_partial", GetBackendName()));
+}
+
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h
index 81a05ee2416e0..4cd17ac72562e 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupStream.h
+++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h
@@ -66,6 +66,58 @@ class ProcessGroupStream : public ProcessGroup {
       const AllreduceOptions& options,
       bool sync_op,
       bool use_calc_stream);
+
+  std::shared_ptr<ProcessGroup::Task> Send(
+      std::vector<phi::DenseTensor>& tensors,  // NOLINT
+      int dst_rank,
+      bool sync_op) override;
+
+  virtual std::shared_ptr<ProcessGroup::Task> Send(
+      std::vector<phi::DenseTensor>& tensors,  // NOLINT
+      int dst_rank,
+      bool sync_op,
+      bool use_calc_stream);
+
+  std::shared_ptr<ProcessGroup::Task> Send_Partial(
+      phi::DenseTensor& tensors,  // NOLINT
+      int dst_rank,
+      int offset,
+      int length,
+      bool sync_op) override;
+
+  virtual std::shared_ptr<ProcessGroup::Task> Send_Partial(
+      phi::DenseTensor& tensors,  // NOLINT
+      int dst_rank,
+      int offset,
+      int length,
+      bool sync_op,
+      bool use_calc_stream);
+
+  std::shared_ptr<ProcessGroup::Task> Recv(
+      std::vector<phi::DenseTensor>& tensors,  // NOLINT
+      int src_rank,
+      bool sync_op) override;
+
+  virtual std::shared_ptr<ProcessGroup::Task> Recv(
+      std::vector<phi::DenseTensor>& tensors,  // NOLINT
+      int src_rank,
+      bool sync_op,
+      bool use_calc_stream);
+
+  std::shared_ptr<ProcessGroup::Task> Recv_Partial(
+      phi::DenseTensor& tensors,  // NOLINT
+      int src_rank,
+      int offset,
+      int length,
+      bool sync_op) override;
+
+  virtual std::shared_ptr<ProcessGroup::Task> Recv_Partial(
+      phi::DenseTensor& tensors,  // NOLINT
+      int src_rank,
+      int offset,
+      int length,
+      bool sync_op,
+      bool use_calc_stream);
 };
 
 }  // namespace distributed
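Editor's note: ProcessGroupStream pairs each throwing virtual with a non-virtual forwarder that pins `use_calc_stream` to `false`, so stream-aware backends only override the long overload. A minimal standalone sketch of that forwarding pattern (class names invented; not Paddle code):

```cpp
#include <iostream>
#include <stdexcept>

class StreamGroup {
 public:
  virtual ~StreamGroup() = default;
  // Public entry point: supplies the default and defers to the virtual.
  void Send(int dst_rank, bool sync_op) {
    Send(dst_rank, sync_op, /*use_calc_stream=*/false);
  }
  virtual void Send(int /*dst_rank*/, bool /*sync_op*/,
                    bool /*use_calc_stream*/) {
    throw std::runtime_error("backend does not support send");
  }
};

class NcclLikeGroup : public StreamGroup {
 public:
  using StreamGroup::Send;  // keep the two-argument forwarder visible
  void Send(int dst_rank, bool sync_op, bool use_calc_stream) override {
    std::cout << "send to rank " << dst_rank << ", sync_op=" << sync_op
              << ", use_calc_stream=" << use_calc_stream << "\n";
  }
};

int main() {
  NcclLikeGroup group;
  group.Send(/*dst_rank=*/1, /*sync_op=*/true);  // forwards with false
  return 0;
}
```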
diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt
index e5cfe838c54f3..7d99a80eaeea8 100644
--- a/paddle/fluid/eager/CMakeLists.txt
+++ b/paddle/fluid/eager/CMakeLists.txt
@@ -14,7 +14,8 @@ set(eager_deps
     grad_node_info
     grad_tensor_holder
     accumulation_node
-    custom_operator_node)
+    custom_operator_node
+    python)
 
 set(fluid_deps
     tracer
@@ -77,6 +78,10 @@ cc_library(
   autograd_meta
   hook_utils)
 
+cc_library(
+  saved_tensors_hooks
+  SRCS saved_tensors_hooks.cc
+  DEPS hook_utils)
 if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
   add_subdirectory(tests)
 endif()
diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc
index 0017dba79742e..12bbfbbb25d2b 100644
--- a/paddle/fluid/eager/accumulation/accumulation_node.cc
+++ b/paddle/fluid/eager/accumulation/accumulation_node.cc
@@ -16,6 +16,7 @@
 #include "glog/logging.h"
 #include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -89,7 +90,7 @@ GradNodeAccumulation::operator()(
                      kSlotSmallVectorSize>& grads,  // NOLINT
     bool create_graph,
     bool is_new_grad) {
-  VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
+  VLOG(3) << "Running AD API Grad: GradNodeAccumulation";
   PADDLE_ENFORCE(grads.size() == 1,
                  paddle::platform::errors::Fatal(
                      "GradNodeAccumulation should take exactly 1 grad tensor"
@@ -122,7 +123,22 @@ GradNodeAccumulation::operator()(
   if (ReduceHooksRegistered()) {
     ApplyReduceHooks();
   }
+  VLOG(3) << "Finish AD API Grad: GradNodeAccumulation";
+  if (VLOG_IS_ON(4)) {
+    const char* INPUT_PRINT_TEMPLATE = "{ Input: [%s], Output: [%s] } ";
+    std::string input_str = "";
+    std::string output_str = "";
+    const char* TENSOR_OUT_GRAD_TEMPLATE = "(grads[0][0], [%s]), ";
+    std::string input_out_grad_str = paddle::string::Sprintf(
+        TENSOR_OUT_GRAD_TEMPLATE, egr::EagerUtils::TensorStr(grads[0][0]));
+    const char* TENSOR_X_GRAD_TEMPLATE = "(grad_out, [%s]), ";
+    std::string output_x_grad_str = paddle::string::Sprintf(
+        TENSOR_X_GRAD_TEMPLATE, egr::EagerUtils::TensorStr(grad_out));
+    output_str += output_x_grad_str;
+    VLOG(4) << paddle::string::Sprintf(
+        INPUT_PRINT_TEMPLATE, input_str, output_str);
+  }
   return {{grad_out}};
 }
diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h
index 8dbc2872ca2bb..f8e2c4327e142 100644
--- a/paddle/fluid/eager/accumulation/accumulation_node.h
+++ b/paddle/fluid/eager/accumulation/accumulation_node.h
@@ -24,7 +24,7 @@ class GradNodeAccumulation : public GradNodeBase {
  public:
   // Constructor: configure fwd input tensors to grad node
   explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) {
-    VLOG(6) << "Construct GradNodeAccumulation";
+    VLOG(5) << "Construct GradNodeAccumulation";
     if (meta) {
       weak_grad_ = meta->WeakGrad();
     }
@@ -33,7 +33,7 @@ class GradNodeAccumulation : public GradNodeBase {
   }
 
   ~GradNodeAccumulation() override {
-    VLOG(6) << "Destruct GradNodeAccumulation";
+    VLOG(5) << "Destruct GradNodeAccumulation";
   }
 
   // Functor: perform backward computations
@@ -44,7 +44,7 @@ class GradNodeAccumulation : public GradNodeBase {
       bool create_graph = false,
       bool is_new_grad = false) override;
 
-  void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
+  void ClearTensorWrappers() override { VLOG(5) << "Do nothing here now"; }
 
   std::string name() { return "GradNodeAccumulation"; }
diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h
index 49d401b92303e..bc970f4e2d859 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h
+++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h
@@ -16,10 +16,10 @@
 
 #include "paddle/phi/api/include/tensor.h"
 
-paddle::experimental::Tensor add_n_dygraph_function(
+paddle::experimental::Tensor add_n_ad_func(
     const std::vector<paddle::experimental::Tensor>& x);
 
-paddle::experimental::Tensor conv2d_dygraph_function(
+paddle::experimental::Tensor conv2d_ad_func(
     const paddle::experimental::Tensor& input,
     const paddle::experimental::Tensor& filter,
     std::vector<int> strides,
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc
index 3081eaf3584f6..fc42340211310 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc
@@ -23,7 +23,7 @@
 #pragma GCC diagnostic ignored "-Wunused-variable"
 DECLARE_bool(check_nan_inf);
 
-paddle::experimental::Tensor add_n_dygraph_function(
+paddle::experimental::Tensor add_n_ad_func(
     const std::vector<paddle::experimental::Tensor>& x) {
   // Dygraph Record Event
   paddle::platform::RecordEvent dygraph_entrance_record_event(
@@ -46,7 +46,7 @@ paddle::experimental::Tensor add_n_dygraph_function(
     paddle::imperative::AutoCastGuard guard(
         egr::Controller::Instance().GetCurrentTracer(),
         paddle::imperative::AmpLevel::O0);
-    return add_n_dygraph_function(NEW_x);
+    return add_n_ad_func(NEW_x);
   }
 }
@@ -56,7 +56,7 @@ paddle::experimental::Tensor add_n_dygraph_function(
   std::vector<egr::AutogradMeta*>* x_autograd_meta = &x_autograd_meta_vec;
   // Forward API Call
   VLOG(3) << "Final State Running: "
-          << "add_n_dygraph_function";
+          << "add_n_ad_func";
   auto api_result = paddle::experimental::add_n(x);
   // Check NaN and Inf if needed
   if (FLAGS_check_nan_inf) {
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc
index 3e2e67297834d..5e221d3f07f6b 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc
@@ -24,7 +24,7 @@
 #pragma GCC diagnostic ignored "-Wunused-variable"
 DECLARE_bool(check_nan_inf);
 
-paddle::experimental::Tensor conv2d_dygraph_function(
+paddle::experimental::Tensor conv2d_ad_func(
     const paddle::experimental::Tensor& input,
     const paddle::experimental::Tensor& filter,
     std::vector<int> strides,
@@ -60,17 +60,17 @@ paddle::experimental::Tensor conv2d_dygraph_function(
     paddle::imperative::AutoCastGuard guard(
         egr::Controller::Instance().GetCurrentTracer(),
         paddle::imperative::AmpLevel::O0);
-    return conv2d_dygraph_function(NEW_input,
-                                   NEW_filter,
-                                   strides,
-                                   paddings,
-                                   paddding_algorithm,
-                                   groups,
-                                   dilations,
-                                   data_format,
-                                   use_addto,
-                                   workspace_size_MB,
-                                   exhaustive_search);
+    return conv2d_ad_func(NEW_input,
+                          NEW_filter,
+                          strides,
+                          paddings,
+                          paddding_algorithm,
+                          groups,
+                          dilations,
+                          data_format,
+                          use_addto,
+                          workspace_size_MB,
+                          exhaustive_search);
   }
 }
 
@@ -89,17 +89,17 @@ paddle::experimental::Tensor conv2d_dygraph_function(
   bool is_enable_tune =
       paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune();
   paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune();
-  auto out = conv2d_dygraph_function(NEW_input,
-                                     filter,
-                                     strides,
-                                     paddings,
-                                     paddding_algorithm,
-                                     groups,
-                                     dilations,
-                                     data_format,
-                                     use_addto,
-                                     workspace_size_MB,
-                                     exhaustive_search);
+  auto out = conv2d_ad_func(NEW_input,
+                            filter,
+                            strides,
+                            paddings,
+                            paddding_algorithm,
+                            groups,
+                            dilations,
+                            data_format,
+                            use_addto,
+                            workspace_size_MB,
+                            exhaustive_search);
   transformer->SetOutTensorLayout(&out);
   if (is_enable_tune) {
     paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune();
@@ -115,7 +115,7 @@ paddle::experimental::Tensor conv2d_dygraph_function(
       egr::EagerUtils::nullable_autograd_meta(filter);
   // Forward API Call
   VLOG(3) << "Final State Running: "
-          << "conv2d_dygraph_function";
+          << "conv2d_ad_func";
   auto api_result = paddle::experimental::conv2d(input,
                                                  filter,
                                                  strides,
diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
index b0dc4f59ffda5..6f7a34094b19d 100644
--- a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
+++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc
@@ -64,8 +64,7 @@ AddNGradNodeFinal::operator()(
 
   // dygraph function
   for (size_t i = 0; i < returns[0].size(); i++) {
-    returns[0][i] =
-        ::scale_dygraph_function(out_grad, phi::Scalar(1.0), 0.0, true);
+    returns[0][i] = ::scale_ad_func(out_grad, phi::Scalar(1.0), 0.0, true);
   }
 
   // Check NaN and Inf id needed
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
index ea1bc2271c194..d733dbf8b7c28 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
@@ -531,7 +531,6 @@ fused_attention_dygraph_function(
       egr::EagerUtils::SetHistory(p_autograd_Y, grad_node);
       grad_node->SetGradInMeta(Y, 19);
       egr::EagerUtils::CheckAndRetainGrad(Y);
-
       auto QKVOut_accumulation_node =
           std::make_shared<egr::GradNodeAccumulation>(p_autograd_QKVOut);
       egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVOut, 0);
diff --git a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py
index 9022e800905d0..fcc66893a7164 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py
@@ -161,11 +161,24 @@ def str2Hump(text):
     string = str2Hump(string)
     if string.rfind("Grad") == (len(string) - 4):
         string = string[:-4]
-    return f"{string}GradNodeFinal"
+    return f"{string}GradNode"
 
 
 def GetDygraphForwardFunctionName(string):
-    return f"{string}_dygraph_function"
+    return f"{string}_ad_func"
+
+
+def GetDygraphLogName(string):
+
+    def str2Hump(text):
+        arr = filter(None, text.split('_'))
+        res = ''
+        for i in arr:
+            res = res + i[0].upper() + i[1:]
+        return res
+
+    string = str2Hump(string)
+    return string
 
 
 def GetIntermediateAPIFunctionName(string):
@@ -198,7 +211,7 @@ def GetInplacedFunctionName(function_name):
 
 
 def GetForwardFunctionName(string):
-    return f"{string}_dygraph_function"
+    return f"{string}_ad_func"
 
 
 def GetIndent(num):
diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
index d16ed62f5bf24..e22c63225813e 100644
--- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -23,7 +23,7 @@
 from codegen_utils import FindGradName, FindForwardName, GetSavedName, GetGradNodeName
 from codegen_utils import IsPlainTensorType, IsVectorTensorType
 from codegen_utils import GetConstReference, RemoveConstAndReference
-from codegen_utils import GetDygraphForwardFunctionName, GetIntermediateAPIFunctionName
+from codegen_utils import GetDygraphForwardFunctionName, GetIntermediateAPIFunctionName, GetDygraphLogName
 from codegen_utils import GetAutoGradMetaName, GetAutoGradMetaVectorName
 from codegen_utils import RemoveSpecialSymbolsInName, RecoverBaseNameOfInplaceFunction
 from codegen_utils import GetInplacedFunctionName
@@ -150,6 +150,7 @@ class {} : public egr::GradNodeBase {{
 GRAD_FUNCTION_TEMPLATE = \
 """
 paddle::small_vector<std::vector<paddle::experimental::Tensor>, egr::kSlotSmallVectorSize> {}::operator()(paddle::small_vector<std::vector<paddle::experimental::Tensor>, egr::kSlotSmallVectorSize>& grads, bool create_graph, bool is_new_grad) {{
+  VLOG(3) << \"Running AD API GRAD: \" << \"{}\";
   // Fill Zero For GradIn Tensors
 {}
   // Apply Gradient Hooks
@@ -166,7 +167,7 @@ class {} : public egr::GradNodeBase {{
   // Inplace Strategy
 {}
   // Call grad_api function
-  VLOG(3) << \"Final State Running: {}\";
+  VLOG(5) << \"Running C++ API: \" << \"{}\";
 {}
   // Check NaN and Inf id needed
 {}
@@ -174,6 +175,9 @@ class {} : public egr::GradNodeBase {{
 {}
   // Create Grad Node
 {}
+  VLOG(4) << \"Finish AD API GRAD: {}";
+  // LOG IF DEBUG
+  {}
   // Return
 {}
 }}
@@ -182,6 +186,7 @@ class {} : public egr::GradNodeBase {{
 FORWARD_FUNCTION_TEMPLATE = \
 """
 {} {}({}) {{
+  VLOG(3) << \"Running AD API: \" << \"{}\";
   // Dygraph Record Event
 {}
   // AMP Logic
@@ -191,7 +196,7 @@ class {} : public egr::GradNodeBase {{
   // Get Input AutoGradMeta
 {}
   // Forward API Call
-  VLOG(3) << \"Final State Running: \" << \"{}\";
+  VLOG(5) << \"Running C++ API: \" << \"{}\";
 {}
   // Check NaN and Inf if needed
 {}
@@ -206,15 +211,29 @@ class {} : public egr::GradNodeBase {{
 {}{}
   // Node Creation
 {}
+
+  VLOG(4) << \"Finish AD API: {}";
+  // LOG IF DEBUG
+  {}
   // Returns
   return {};
 }}
 """
 
+LOG_PRINT_TEMPLATE = \
+"""
+  if(VLOG_IS_ON(4)){{
+    const char* INPUT_PRINT_TEMPLATE = \"{{ Input: [%s], Output: [%s] }} \";
+    {}
+    VLOG(4) << paddle::string::Sprintf(INPUT_PRINT_TEMPLATE, input_str, output_str);
+  }}
+"""
+
 FORWARD_ONLY_FUNCTION_TEMPLATE = \
 """
 {} {}({}) {{
+  VLOG(3) << \"Running AD API: \" << \"{}\";
   // Dygraph Record Event
 {}
   // AMP Logic
@@ -222,11 +241,13 @@ class {} : public egr::GradNodeBase {{
   // Layout autotune
 {}
   // Forward API Call
-  VLOG(3) << \"Final State Running: \" << \"{}\";
+  VLOG(5) << \"Running C++ API: \" << \"{}\";
 {}
   // Get Outputs
 {}
-
+  VLOG(4) << \"Finish AD API: {}";
+  // LOG IF DEBUG
+  {}
   // Returns
   return {};
 }}
@@ -867,7 +888,7 @@ def GenerateNodeCreationCodes(self, for_backward=False):
             set_grad_out_meta_list.append(set_grad_out_meta)
         set_grad_out_meta_str = "\n".join(set_grad_out_meta_list)
 
-        # SetOutRank & SetHistory & SetGradInMeta & CheckAndRetainGrad
+        # SetOutRank & SetHistory & SetGradInMeta
         set_out_rank_list = []
         set_history_list = []
         set_grad_in_meta_list = []
@@ -885,7 +906,6 @@ def GenerateNodeCreationCodes(self, for_backward=False):
             set_grad_in_meta = f"{indent}grad_node->SetGradInMeta({name}, {pos});"
             set_retain_grad = f"{indent}egr::EagerUtils::CheckAndRetainGrad({name});"
-
             set_out_rank_list.append(set_out_rank)
             set_history_list.append(set_history)
             set_grad_in_meta_list.append(set_grad_in_meta)
@@ -1294,7 +1314,8 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
         node_creation_str = self.node_creation_str
 
         dygraph_event_str = f"{indent}paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);\n"
-        forward_function_name = GetDygraphForwardFunctionName(forward_api_name)
+        forward_ad_function_name = GetDygraphForwardFunctionName(
+            forward_api_name)
 
         # Forward amp logic
         kernel_trans2_op_name_str = f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");"
@@ -1307,9 +1328,10 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
             amp_autocast_list) + "    " + "    ".join(
                 amp_autocast_optional_list)
         amp_inputs_call_args_str = ", ".join(amp_inputs_call_list)
-        amp_call_str = f"return {forward_function_name}({amp_inputs_call_args_str});"
+        amp_call_str = f"return {forward_ad_function_name}({amp_inputs_call_args_str});"
         if is_inplaced or (forward_api_name == "cast"):
-            amp_logic_str = ""
+            amp_logic_str = "\n VLOG(5) << \" No AMP for {} because it is a inplace or cast api. \"; ".format(
+                forward_ad_function_name)
         else:
             amp_logic_str = AMP_LOGIC_TEMPLATE.format(
                 kernel_trans2_op_name_str, amp_tensors_vector_list_str,
@@ -1335,8 +1357,8 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
                 layout_autotune_attr) == 0:
             layout_logic_str = ""
         else:
-            # after_call_str = f"return {forward_function_name}({layout_inputs_call_args_str});\n"
-            after_call_str = f"auto api_result = {forward_function_name}({layout_inputs_call_args_str});\n"
+            # after_call_str = f"return {forward_ad_function_name}({layout_inputs_call_args_str});\n"
+            after_call_str = f"auto api_result = {forward_ad_function_name}({layout_inputs_call_args_str});\n"
             layout_logic_str = LAYOUT_LOGIC_TEMPLATE.format(
                 amp_tensors_vector_list_str,
                 "    ".join(layout_tensors_vector_optional_list),
@@ -1345,26 +1367,45 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
                 "    ".join(layout_autotune_optional_list), after_call_str,
                 layout_autotune_outs_list, returns_str)
 
+        # For inputs outputs prepare for logging
+        var_str = f"\n{indent}  std::string input_str = \"\";"
+        var_str += f"\n{indent}  std::string output_str = \"\";"
+        for name, (ttype, pos) in forward_inputs_position_map.items():
+            var_str += f"\n{indent}  const char* TENSOR_{name.upper()}_TEMPLATE = \"({name}, [%s]), \";"
+            var_str += f"\n{indent}  std::string input_{name}_str = paddle::string::Sprintf(TENSOR_{name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({name}));"
+            var_str += f"\n{indent}  input_str += input_{name}_str; "
+
+        for name, (ttype, pos) in forward_outputs_position_map.items():
+            var_str += f"\n{indent}  const char* TENSOR_{name.upper()}_TEMPLATE = \"({name}, [%s]), \";"
+            var_str += f"\n{indent}  std::string output_{name}_str = paddle::string::Sprintf(TENSOR_{name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({name}));"
+            var_str += f"\n{indent}  output_str += output_{name}_str; "
+
+        log_str = LOG_PRINT_TEMPLATE.format(var_str)
+
         # Generate forward_definition_str and forward_declaration_str
         if self.is_forward_only:
             if len(amp_tensors_vector_list) == 0:
-                amp_logic_str = ""
+                amp_logic_str = "\n VLOG(7) << \" No AMP for {} because it has no input. \"; ".format(
\"; ".format( + forward_ad_function_name) self.forward_definition_str += FORWARD_ONLY_FUNCTION_TEMPLATE.format( - returns_type_str, forward_function_name, - inputs_args_definition_str, dygraph_event_str, amp_logic_str, - layout_logic_str, forward_function_name, forward_call_str, - get_outputs_str, returns_str) + returns_type_str, + forward_ad_function_name, inputs_args_definition_str, + GetDygraphLogName(forward_api_name), dygraph_event_str, + amp_logic_str, layout_logic_str, forward_api_name, + forward_call_str, get_outputs_str, forward_ad_function_name, + log_str, returns_str) else: self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format( - returns_type_str, forward_function_name, - inputs_args_definition_str, dygraph_event_str, amp_logic_str, - layout_logic_str, inputs_autograd_meta_str, - forward_function_name, forward_call_str, check_nan_inf_str, + returns_type_str, + forward_ad_function_name, inputs_args_definition_str, + GetDygraphLogName(forward_api_name), dygraph_event_str, + amp_logic_str, layout_logic_str, inputs_autograd_meta_str, + forward_api_name, forward_call_str, check_nan_inf_str, get_outputs_str, outputs_autograd_meta_str, compute_require_grad_args_str, check_inplace_str, - bump_inplace_version_str, node_creation_str, returns_str) + bump_inplace_version_str, node_creation_str, + forward_ad_function_name, log_str, returns_str) - self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n" + self.forward_declaration_str += f"{returns_type_str} {forward_ad_function_name}({inputs_args_declaration_str});\n" def GenerateInplacedForwardDygraphFunctions(self): # Inplaced Version Dygraph Function Generation @@ -1770,7 +1811,8 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, forward_api_name = self.grad_api_contents['invoke'].split( '(')[0].strip() autograd_api = self.grad_api_contents['invoke'].replace( - forward_api_name, forward_api_name + '_dygraph_function', 1) + forward_api_name, + GetDygraphForwardFunctionName(forward_api_name), 1) grad_function_call_str = f""" if (trace_backward) {{ {indent}{autograd_api_out} api_output = {autograd_api}; @@ -1839,13 +1881,40 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, returns_str += f"{indent}return returns;\n" grad_node_name = GetGradNodeName(self.backward_api_name) + # For inputs outputs prepare for logging + var_str = f"\n{indent} std::string input_str = \"\";" + var_str += f"\n{indent} std::string output_str = \"\";" + for name, (ttype, fwd_position, + grad_api_position) in backward_grad_inputs_map.items(): + new_name = self.TransformToNextGradName(name) + var_str += f"\n{indent} const char* TENSOR_{new_name.upper()}_TEMPLATE = \"({new_name}, [%s]), \";" + var_str += f"\n{indent} std::string input_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));" + var_str += f"\n{indent} input_str += input_{new_name}_str; " + + for name, (backward_input_type, is_fwd_input, + grad_api_position), in backward_forward_inputs_map.items(): + new_name = self.TransformToNextGradName(name) + var_str += f"\n{indent} const char* TENSOR_{new_name.upper()}_TEMPLATE = \"({new_name}, [%s]), \";" + var_str += f"\n{indent} std::string input_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));" + var_str += f"\n{indent} input_str += input_{new_name}_str; " + + for name, (ttype, fwd_position, + grad_api_position) in 
+                   grad_api_position) in backward_grad_outputs_map.items():
+            new_name = self.TransformToNextGradName(name)
+            var_str += f"\n{indent}  const char* TENSOR_{new_name.upper()}_TEMPLATE = \"({new_name}, [%s]), \";"
+            var_str += f"\n{indent}  std::string output_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));"
+            var_str += f"\n{indent}  output_str += output_{new_name}_str; "
+
+        log_str = LOG_PRINT_TEMPLATE.format(var_str)
 
         self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format(
-            grad_node_name, fill_zero_str, get_grad_in_args_str,
-            grad_function_prepare_str, compute_require_next_grad_str,
-            inplace_check_str, inplace_for_grad_outs_str, grad_node_name,
+            grad_node_name, GetDygraphLogName(self.backward_api_name),
+            fill_zero_str, get_grad_in_args_str, grad_function_prepare_str,
+            compute_require_next_grad_str, inplace_check_str,
+            inplace_for_grad_outs_str, self.backward_api_name,
             grad_function_call_str, check_nan_inf_str,
-            outputs_autograd_meta_str, next_grad_node_creation_str, returns_str)
+            outputs_autograd_meta_str, next_grad_node_creation_str,
+            GetDygraphLogName(self.backward_api_name), log_str, returns_str)
 
     def run(self):
         super().run()
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index b70ec78c7598c..04541d082c435 100644
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -133,7 +133,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
     AutogradMeta* auto_grad_meta = EagerUtils::nullable_autograd_meta(tensor);
     if (auto_grad_meta == nullptr) {
-      VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
+      VLOG(5) << "Skip auto grad since there is no grad op for var or loss is "
                  "stop_gradient=True: "
              << tensor.name();
      continue;
@@ -141,14 +141,14 @@ std::vector<paddle::experimental::Tensor> RunBackward(
     // Get grad input info from target tensors
     auto input_info = auto_grad_meta->OutRankInfo();
 
-    VLOG(2) << "Out Rank of Tensor is slot: " << input_info.first
+    VLOG(5) << "Out Rank of Tensor is slot: " << input_info.first
             << ", rank: " << input_info.second;
     // Get target GradNodeBase from target tensors
     auto shared_grad_node = auto_grad_meta->GetMutableGradNode();
 
     if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr ||
         auto_grad_meta->StopGradient()) {
-      VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
+      VLOG(5) << "Skip auto grad since there is no grad op for var or loss is "
                  "stop_gradient=True: "
              << tensor.name();
      continue;
@@ -169,7 +169,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
 
     // Prepare GradTensorHolder
     if (!node_input_buffers_dict.count(grad_node)) {
-      VLOG(6) << "Create Value for grad input tensor " << i
+      VLOG(5) << "Create Value for grad input tensor " << i
              << " of grad node: " << grad_node->name();
      node_input_buffers_dict[grad_node] =
          std::make_unique<GradTensorHolder>(grad_node->InputMeta());
@@ -184,13 +184,13 @@ std::vector<paddle::experimental::Tensor> RunBackward(
                             "grad_tensors should either have "
                             "size = 0 or same size as tensors."));
       // Feed given tensor if it's provided
-      VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor";
+      VLOG(3) << "Fill grad input tensor " << i << "with give grad tensor";
 
       // Deep copy
       node_input_buffers_dict[grad_node]->CopyValueFromTensor(
           input_info.first, input_info.second, grad_tensors[i]);
     } else {
-      VLOG(6) << "Fill grad input tensor " << i << " with 1.0";
+      VLOG(3) << "Fill grad input tensor " << i << " with 1.0";
       // Initialize tensor with 1.0
       // Forward Tensor "tensor" is passed to indicate tensortype, datatype and
       // dims
@@ -210,12 +210,12 @@ std::vector<paddle::experimental::Tensor> RunBackward(
         inputs, no_grad_vars, orig_queue, &queue, node_input_buffers_dict);
   }
 
-  VLOG(6) << "Update In degree Map for backward";
+  VLOG(5) << "Update In degree Map for backward";
   // 3. Compute in_degree for each node
   std::unordered_map<GradNodeBase*, int> node_in_degree_map =
       getInDegreeMap(queue);
 
-  VLOG(3) << "Startup_ops's size is " << queue.size();
+  VLOG(5) << "Startup_ops's size is " << queue.size();
 
   /* --- Topological Visit --- */
   // 1. Pop queue
@@ -224,11 +224,10 @@ std::vector<paddle::experimental::Tensor> RunBackward(
   //     |- node(grads)
   //     |- Prepare for next node
   // 3. Update queue
-  VLOG(3) << "Run Backward";
   while (!queue.empty()) {
     GradNodeBase* node = queue.front();
-    VLOG(3) << "Running GradNode:" << node->name() << " addr:" << node;
-
+    VLOG(3) << "Preparing GradNode:" << node->name() << " addr:" << node;
+    VLOG(4) << EagerUtils::GradNodeStr(*node);
     paddle::platform::RecordEvent node_record_event(
         std::string((*node).name()),
         paddle::platform::TracerEventType::Operator,
@@ -255,7 +254,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
     // Check input
     EnforceGradNodeHasInput(node);
 
-    VLOG(6) << "Run Backward Kernel with GradTensorHolder.";
+    VLOG(7) << "Run Backward Kernel with GradTensorHolder.";
     // Run Pre Backward Node and get outputs
     paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                          kSlotSmallVectorSize>
@@ -269,7 +268,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
 
     // retain_grad or not
     if (!retain_graph) {
-      VLOG(6)
+      VLOG(3)
           << "retain_graph is false, need to clear the TensorWrapper of nodes.";
       node->ClearTensorWrappers();
     }
@@ -322,11 +321,11 @@ std::vector<paddle::experimental::Tensor> RunBackward(
 
         if ((!grad_output_tensor.defined() ||
             !grad_output_tensor.initialized())) {
-          VLOG(6) << "We get grad_output_tensor with slot: " << i
+          VLOG(7) << "We get grad_output_tensor with slot: " << i
                  << ", rank: " << j << " as uninitialized or undefined tensor";
        }
 
-        VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i
+        VLOG(7) << "Get Edge and grad_output_tensor with slot: " << i
                << ", rank: " << j
                << " 's name is: " << grad_output_tensor.name();
 
@@ -335,12 +334,12 @@ std::vector<paddle::experimental::Tensor> RunBackward(
           const auto& input_meta = next_node->InputMeta();
           auto grad_tensor_holder =
               std::make_unique<GradTensorHolder>(input_meta);
-          VLOG(6) << "Construct GradTensorHolder for grad node: "
+          VLOG(7) << "Construct GradTensorHolder for grad node: "
                  << next_node->name();
          node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
        }
 
-        VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
+        VLOG(3) << "Sum grad inputs for edge slot: " << edge_rank.first
                << ", rank: " << edge_rank.second;
 
        node_input_buffers_dict[next_node]->add(edge_rank.first,
@@ -350,7 +349,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
 
         // Update queue
         node_in_degree_map[next_node]--;
 
-        VLOG(6) << next_node->name()
+        VLOG(7) << next_node->name()
                << " ref_cnt is: " << node_in_degree_map[next_node];
 
        PADDLE_ENFORCE(
@@ -382,7 +381,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
     }
   }
 
-  VLOG(6) << "Run Backward Final hook size: "
+  VLOG(7) << "Run Backward Final hook size: "
          << egr::Controller::Instance().FinalBackwardHooks().size();
   for (auto& hook : egr::Controller::Instance().FinalBackwardHooks()) {
     (*hook)();
@@ -390,6 +389,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
   egr::Controller::Instance().ClearFinalBackwardHooks();
   if (!is_general_grad) return {};
   return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph);
+  VLOG(3) << "Finish Backward";
 }
 
 void Backward(
diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h
index 4ebc2860c59d9..118c8be861122 100644
--- a/paddle/fluid/eager/eager_amp_auto_cast.h
+++ b/paddle/fluid/eager/eager_amp_auto_cast.h
@@ -45,7 +45,7 @@ inline paddle::experimental::Tensor Cast(
     const bool trace_backward = true) {
   if (input.is_sparse_coo_tensor() || input.is_sparse_csr_tensor()) {
     if (trace_backward) {
-      return sparse::cast_dygraph_function(
+      return sparse::cast_ad_func(
           input, paddle::experimental::DataType::UNDEFINED, dst_dtype);
     } else {
       return paddle::experimental::sparse::cast(
@@ -53,7 +53,7 @@ inline paddle::experimental::Tensor Cast(
     }
   } else {
     if (trace_backward) {
-      return cast_dygraph_function(input, dst_dtype);
+      return cast_ad_func(input, dst_dtype);
     } else {
       return paddle::experimental::cast(input, dst_dtype);
     }
diff --git a/paddle/fluid/eager/eager_layout_transformer.h b/paddle/fluid/eager/eager_layout_transformer.h
index 3f2717be6bef5..d0cb9c481243b 100644
--- a/paddle/fluid/eager/eager_layout_transformer.h
+++ b/paddle/fluid/eager/eager_layout_transformer.h
@@ -35,7 +35,7 @@ inline paddle::experimental::Tensor EagerTraceTransposeOp(
   } else {
     axis = {0, 1, 2, 3};
   }
-  auto out_tensor = transpose_dygraph_function(in, axis);
+  auto out_tensor = transpose_ad_func(in, axis);
   VLOG(4) << "AutoTune Transpose from "
           << paddle::framework::DataLayoutToString(in.layout()) << " to "
           << paddle::framework::DataLayoutToString(layout);
diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc
index 0e102d0d1bc60..afa8a6f205259 100644
--- a/paddle/fluid/eager/grad_node_info.cc
+++ b/paddle/fluid/eager/grad_node_info.cc
@@ -41,7 +41,7 @@ static void CheckTensor(const paddle::experimental::Tensor& pre,
                         "The tensor in before and after hook are not consistent"));
   }
   if (pre.initialized() && post.initialized()) {
-    VLOG(4) << paddle::framework::DataType2String(pre.dtype()) << " "
+    VLOG(7) << paddle::framework::DataType2String(pre.dtype()) << " "
            << paddle::framework::DataType2String(post.dtype());
    PADDLE_ENFORCE_EQ(
        pre.dtype(),
@@ -62,7 +62,7 @@ static void CheckTensor(const paddle::experimental::Tensor& pre,
 }
 
 GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) {
-  VLOG(6) << "Construct GradNodeBase";
+  VLOG(7) << "Construct GradNodeBase";
   bwd_in_meta_.resize(bwd_in_slot_num);
   bwd_out_meta_.resize(bwd_out_slot_num);
 }
@@ -84,7 +84,7 @@ GradNodeBase::MutableOutputMeta() {
 void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
                                  size_t slot_rank) {
-  VLOG(6) << "Set GradSlotMeta for Grad Inputs";
+  VLOG(7) << "Set GradSlotMeta for Grad Inputs";
   auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out);
   PADDLE_ENFORCE_LE(
       slot_rank,
@@ -104,7 +104,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
   }
 
   if (!fwd_out.initialized()) {
-    VLOG(6)
+    VLOG(7)
         << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor";
     return;
   }
@@ -123,7 +123,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
         static_cast<phi::SparseCsrTensor*>(fwd_out.impl().get());
     dense_tensor = csr_tensor->mutable_non_zero_elements();
   } else {
-    VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
+    VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
               "non-DenseTensor argument.";
   }
   PADDLE_ENFORCE_NE(
@@ -145,7 +145,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
 void GradNodeBase::SetGradInMeta(
     const std::vector<paddle::experimental::Tensor>& fwd_out,
     size_t slot_rank) {
-  VLOG(6) << "Set GradSlotMeta for Grad Inputs";
+  VLOG(7) << "Set GradSlotMeta for Grad Inputs";
   size_t slot_size = fwd_out.size();
   PADDLE_ENFORCE_LE(
       slot_rank,
@@ -177,7 +177,7 @@ void GradNodeBase::SetGradInMeta(
     }
 
     if (!fwd_out_tensor.initialized()) {
-      VLOG(6)
+      VLOG(7)
           << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor";
       return;
     }
@@ -202,7 +202,7 @@ void GradNodeBase::SetGradInMeta(
         need_complex_to_real_ = true;
       }
     } else {
-      VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta "
+      VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta "
                 "with non-DenseTensor argument.";
     }
   }
@@ -260,7 +260,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in,
       meta.SetPlace(fwd_in.place());
     }
   } else {
-    VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
+    VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
              "non-DenseTensor argument.";
   }
 }
@@ -319,7 +319,7 @@ void GradNodeBase::SetGradOutMeta(
         meta.SetPlace(fwd_in_tensor.place());
       }
     } else {
-      VLOG(6)
+      VLOG(7)
           << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
             "non-DenseTensor argument.";
     }
diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h
index ebe1f6cccf93d..650446401468f 100644
--- a/paddle/fluid/eager/grad_node_info.h
+++ b/paddle/fluid/eager/grad_node_info.h
@@ -74,7 +74,7 @@ class Edge {
   }
 
   void SetGradNode(const std::shared_ptr<GradNodeBase>& node) {
-    VLOG(6) << "Reseting Edge's Grad Node";
+    VLOG(7) << "Reseting Edge's Grad Node";
     grad_node_ = node;
   }
 
@@ -167,10 +167,10 @@ class GradSlotMeta {
 
 class GradNodeBase {
  public:
-  GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; }
+  GradNodeBase() { VLOG(7) << "Construct GradNodeBase"; }
   GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num);
   // TODO(jiabin): Should we have other constructor here?
-  virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; }
+  virtual ~GradNodeBase() { VLOG(7) << "Destruct GradNodeBase"; }
 
   /**
    * operator() designed to contian the real backward execution logic, it should
@@ -255,14 +255,14 @@ class GradNodeBase {
   std::map<int64_t, std::tuple<size_t, size_t, std::shared_ptr<TensorHook>>>
   GetGradientHookFuntions() {
-    VLOG(6) << "GetGradientHookFuntions ";
+    VLOG(7) << "GetGradientHookFuntions ";
     return gradient_hooks_;
   }
 
   void SetGradientHookFuntions(
       std::map<int64_t, std::tuple<size_t, size_t, std::shared_ptr<TensorHook>>>
           hooks) {
-    VLOG(6) << "SetGradientHookFuntions ";
+    VLOG(7) << "SetGradientHookFuntions ";
     gradient_hooks_ = hooks;
   }
diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc
index afd9e4ef865ff..14a8c26f9dcb8 100644
--- a/paddle/fluid/eager/grad_tensor_holder.cc
+++ b/paddle/fluid/eager/grad_tensor_holder.cc
@@ -143,7 +143,7 @@ void GradTensorHolder::add(size_t slot_id,
     if (t.is_dense_tensor()) {
       if (buffer_tensor.is_dense_tensor()) {
         if (create_graph || t.is_custom_device()) {
-          buffer_tensor = add_dygraph_function(t, buffer_tensor);
+          buffer_tensor = add_ad_func(t, buffer_tensor);
         } else {
           paddle::imperative::TensorAdd<paddle::experimental::Tensor>(
               t, &buffer_tensor);
@@ -170,7 +170,7 @@ void GradTensorHolder::add(size_t slot_id,
               std::make_shared<phi::DenseTensor>(
                   buffer_sparse->non_zero_elements()));
       if (create_graph || t.is_custom_device()) {
-        buffer_values = add_dygraph_function(t_values, buffer_values);
+        buffer_values = add_ad_func(t_values, buffer_values);
       } else {
         paddle::imperative::TensorAdd<paddle::experimental::Tensor>(
             t_values, &buffer_values);
diff --git a/paddle/fluid/eager/hooks.h b/paddle/fluid/eager/hooks.h
index 064c96bff380b..f501c4acc6210 100644
--- a/paddle/fluid/eager/hooks.h
+++ b/paddle/fluid/eager/hooks.h
@@ -62,4 +62,18 @@ class CppVoidHook : public VoidHook {
   std::function<void()> fn_;
 };
 
+class PackHookBase {
+ public:
+  virtual ~PackHookBase() = default;
+  virtual void* operator()(const paddle::experimental::Tensor& tensor) = 0;
+  virtual void* operator()(void* py_tensor) = 0;
+};
+
+class UnPackHookBase {
+ public:
+  virtual ~UnPackHookBase() = default;
+  virtual paddle::experimental::Tensor operator()(void* packed_value) = 0;
+  virtual void* operator()(void* packed_value, void* other) = 0;
+};
+
 }  // namespace egr
diff --git a/paddle/fluid/eager/saved_tensors_hooks.cc b/paddle/fluid/eager/saved_tensors_hooks.cc
new file mode 100644
index 0000000000000..6bd62c21611c0
--- /dev/null
+++ b/paddle/fluid/eager/saved_tensors_hooks.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/eager/saved_tensors_hooks.h"
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+
+#if !(defined(PADDLE_NO_PYTHON) && defined(PADDLE_ON_INFERENCE))
+#include "paddle/fluid/pybind/eager.h"
+#include "paddle/fluid/pybind/eager_utils.h"
+#endif
+
+namespace egr {
+#if !(defined(PADDLE_NO_PYTHON) && defined(PADDLE_ON_INFERENCE))
+PackHook::PackHook(PyObject* hook) : hook_(hook) { Py_INCREF(hook_); }
+
+PackHook::~PackHook() {
+  ::pybind11::gil_scoped_acquire gil;
+  Py_DECREF(hook_);
+}
+
+void* PackHook::operator()(const paddle::experimental::Tensor& tensor) {
+  bool grad_tmp = egr::Controller::Instance().HasGrad();
+  egr::Controller::Instance().SetHasGrad(false);
+  ::pybind11::gil_scoped_acquire gil;
+  auto args = PyTuple_New(1);
+  PyTuple_SET_ITEM(args, 0, paddle::pybind::ToPyObject(tensor));
+  PyObject* ret = PyObject_Call(hook_, args, nullptr);
+  Py_XDECREF(args);
+  egr::Controller::Instance().SetHasGrad(grad_tmp);
+  return reinterpret_cast<void*>(ret);
+}
+
+void* PackHook::operator()(void* py_tensor) {
+  bool grad_tmp = egr::Controller::Instance().HasGrad();
+  egr::Controller::Instance().SetHasGrad(false);
+  ::pybind11::gil_scoped_acquire gil;
+  auto args = PyTuple_New(1);
+  Py_INCREF(reinterpret_cast<PyObject*>(py_tensor));
+  PyTuple_SET_ITEM(args, 0, reinterpret_cast<PyObject*>(py_tensor));
+  PyObject* ret = PyObject_Call(hook_, args, nullptr);
+  Py_XDECREF(args);
+  egr::Controller::Instance().SetHasGrad(grad_tmp);
+  return reinterpret_cast<void*>(ret);
+}
+
+UnPackHook::UnPackHook(PyObject* hook) : hook_(hook) { Py_INCREF(hook_); }
+
+UnPackHook::~UnPackHook() {
+  ::pybind11::gil_scoped_acquire gil;
+  Py_DECREF(hook_);
+}
+
+paddle::experimental::Tensor UnPackHook::operator()(void* packed_value) {
+  bool grad_tmp = egr::Controller::Instance().HasGrad();
+  egr::Controller::Instance().SetHasGrad(false);
+  ::pybind11::gil_scoped_acquire gil;
+  auto args = PyTuple_New(1);
+  Py_INCREF(reinterpret_cast<PyObject*>(packed_value));
+  PyTuple_SET_ITEM(args, 0, reinterpret_cast<PyObject*>(packed_value));
+  PyObject* ret = PyObject_Call(hook_, args, nullptr);
+  Py_XDECREF(args);
+  egr::Controller::Instance().SetHasGrad(grad_tmp);
+
+  PADDLE_ENFORCE_EQ(paddle::pybind::IsEagerTensor(ret),
+                    true,
+                    paddle::platform::errors::InvalidArgument(
+                        "paddle.autograd.saved_tensors_hooks only one pair "
+                        "of hooks is allowed at a time."));
+
+  auto tensor = reinterpret_cast<paddle::pybind::TensorObject*>(ret)->tensor;
+  Py_XDECREF(ret);
+  return tensor;
+}
+
+void* UnPackHook::operator()(void* packed_value, void* other) {
+  bool grad_tmp = egr::Controller::Instance().HasGrad();
+  egr::Controller::Instance().SetHasGrad(false);
+  ::pybind11::gil_scoped_acquire gil;
+  auto args = PyTuple_New(1);
+  Py_INCREF(reinterpret_cast<PyObject*>(packed_value));
+  PyTuple_SET_ITEM(args, 0, reinterpret_cast<PyObject*>(packed_value));
+  PyObject* ret = PyObject_Call(hook_, args, nullptr);
+  Py_XDECREF(args);
+  egr::Controller::Instance().SetHasGrad(grad_tmp);
+
+  PADDLE_ENFORCE_EQ(paddle::pybind::IsEagerTensor(ret),
+                    true,
+                    paddle::platform::errors::InvalidArgument(
+                        "paddle.autograd.saved_tensors_hooks only one pair "
+                        "of hooks is allowed at a time."));
+
+  return reinterpret_cast<void*>(ret);
+}
+#endif
+
+}  // namespace egr
diff --git a/paddle/fluid/eager/saved_tensors_hooks.h b/paddle/fluid/eager/saved_tensors_hooks.h
new file mode 100644
index 0000000000000..1deb30daaa8e1
--- /dev/null
+++ b/paddle/fluid/eager/saved_tensors_hooks.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <Python.h>
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/eager/hooks.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+
+namespace egr {
+#if !(defined(PADDLE_NO_PYTHON) && defined(PADDLE_ON_INFERENCE))
+class PackHook : public PackHookBase {
+ public:
+  explicit PackHook(PyObject* hook);
+
+  ~PackHook();
+
+  void* operator()(const paddle::experimental::Tensor& tensor) override;
+
+  void* operator()(void* py_tensor) override;
+
+ private:
+  PyObject* hook_;
+};
+
+class UnPackHook : public UnPackHookBase {
+ public:
+  explicit UnPackHook(PyObject* hook);
+
+  ~UnPackHook();
+
+  paddle::experimental::Tensor operator()(void* packed_value) override;
+
+  void* operator()(void* packed_value, void* other) override;
+
+ private:
+  PyObject* hook_;
+};
+#endif
+
+class SavedTensorsHooks {
+ public:
+  SavedTensorsHooks() = default;
+
+  ~SavedTensorsHooks() {}
+
+  void SetHooks(PyObject* pack_hook, PyObject* unpack_hook) {
+#if !(defined(PADDLE_NO_PYTHON) && defined(PADDLE_ON_INFERENCE))
+    PADDLE_ENFORCE_EQ(pack_hook_ == nullptr && unpack_hook_ == nullptr,
+                      true,
+                      paddle::platform::errors::InvalidArgument(
+                          "paddle.autograd.saved_tensors_hooks only one pair "
+                          "of hooks is allowed at a time."));
+    pack_hook_ = std::make_shared<PackHook>(pack_hook);
+    unpack_hook_ = std::make_shared<UnPackHook>(unpack_hook);
+    is_enable_ = true;
+#endif
+  }
+
+  void ResetHooks() {
+#if !(defined(PADDLE_NO_PYTHON) && defined(PADDLE_ON_INFERENCE))
+    pack_hook_ = nullptr;
+    unpack_hook_ = nullptr;
+    is_enable_ = false;
+#endif
+  }
+
+  bool IsEnable() { return is_enable_; }
+
+  std::shared_ptr<PackHookBase> GetPackHook() { return pack_hook_; }
+  std::shared_ptr<UnPackHookBase> GetUnPackHook() { return unpack_hook_; }
+
+  static SavedTensorsHooks& GetInstance() {
+    static SavedTensorsHooks instance;
+    return instance;
+  }
+
+ private:
+  std::shared_ptr<PackHookBase> pack_hook_;
+  std::shared_ptr<UnPackHookBase> unpack_hook_;
+  bool is_enable_{false};
+};
+
+}  // namespace egr
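Editor's note: the pack/unpack machinery above replaces a saved tensor with whatever the Python pack hook returns, and feeds that value back through the unpack hook when `recover()` is called. A toy, Python-free model of that flow (all names invented; `std::function` stands in for the PyObject hooks; not Paddle code):

```cpp
#include <cassert>
#include <functional>
#include <vector>

using Tensor = std::vector<float>;
using PackHook = std::function<int(const Tensor&)>;  // tensor -> handle
using UnpackHook = std::function<Tensor(int)>;       // handle -> tensor

// Stands in for TensorWrapper: stores only the packed handle plus the
// unpack hook, and rebuilds the tensor on demand.
struct ToyTensorWrapper {
  int packed_value;
  UnpackHook unpack;
  Tensor Recover() const { return unpack(packed_value); }
};

int main() {
  std::vector<Tensor> offloaded;  // pretend this lives on CPU or disk
  PackHook pack = [&](const Tensor& t) {
    offloaded.push_back(t);
    return static_cast<int>(offloaded.size() - 1);
  };
  UnpackHook unpack = [&](int handle) { return offloaded[handle]; };

  Tensor activation{1.f, 2.f, 3.f};
  ToyTensorWrapper wrapper{pack(activation), unpack};  // save: pack
  assert(wrapper.Recover() == activation);             // backward: unpack
  return 0;
}
```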
diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index a6fd57ac6a4bc..35a4c83257f6a 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -27,6 +27,7 @@ #pragma once #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/saved_tensors_hooks.h" #include "paddle/fluid/eager/utils.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -69,7 +70,20 @@ class TensorWrapper { "Unrecognized tensor type for no_need_buffer feature")); } } else { - intermidiate_tensor_.set_impl(tensor.impl()); + if (SavedTensorsHooks::GetInstance().IsEnable() && + tensor.is_dense_tensor()) { + phi::DenseTensor* dense_tensor = + static_cast<phi::DenseTensor*>(tensor.impl().get()); + intermidiate_tensor_.set_impl( + std::move(std::make_shared<phi::DenseTensor>( + std::make_shared<phi::Allocation>(nullptr, 0, tensor.place()), + dense_tensor->meta()))); + auto pack_hook = SavedTensorsHooks::GetInstance().GetPackHook(); + unpack_hook_ = SavedTensorsHooks::GetInstance().GetUnPackHook(); + packed_value_ = reinterpret_cast<PyObject*>((*pack_hook)(tensor)); + } else { + intermidiate_tensor_.set_impl(tensor.impl()); + } } if (VLOG_IS_ON(7)) { @@ -86,6 +100,29 @@ } } + TensorWrapper(const TensorWrapper& other) { + no_need_buffer_ = other.no_need_buffer_; + intermidiate_tensor_ = other.intermidiate_tensor_; + weak_grad_node_ = other.weak_grad_node_; + inplace_version_snapshot_ = other.inplace_version_snapshot_; + packed_value_ = other.packed_value_; + unpack_hook_ = other.unpack_hook_; + Py_XINCREF(packed_value_); + } + + TensorWrapper& operator=(const TensorWrapper& other) { + no_need_buffer_ = other.no_need_buffer_; + intermidiate_tensor_ = other.intermidiate_tensor_; + weak_grad_node_ = other.weak_grad_node_; + inplace_version_snapshot_ = other.inplace_version_snapshot_; + Py_XINCREF(other.packed_value_); + Py_XDECREF(packed_value_); + packed_value_ = other.packed_value_; + unpack_hook_ = other.unpack_hook_; + return *this; + } + + ~TensorWrapper() { Py_XDECREF(packed_value_); } + paddle::experimental::Tensor recover() { VLOG(6) << "Recover tensor: " << intermidiate_tensor_.name() << " for wrapper"; @@ -94,16 +131,25 @@ class TensorWrapper { return paddle::experimental::Tensor(); } - check_inplace_version(); + if (packed_value_ && unpack_hook_) { + auto tensor_unpacked = + (*unpack_hook_)(reinterpret_cast<void*>(packed_value_)); + auto src_dense_tensor = + static_cast<phi::DenseTensor*>(tensor_unpacked.impl().get()); + static_cast<phi::DenseTensor*>(intermidiate_tensor_.impl().get()) + ->ResetHolder(src_dense_tensor->MoveMemoryHolder()); + } else { + check_inplace_version(); + } paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; std::shared_ptr<GradNodeBase> new_grad_node = weak_grad_node_.lock(); if (new_grad_node) { - VLOG(3) << "Recovered TensorWrapper with GradNode " + VLOG(7) << "Recovered TensorWrapper with GradNode " << new_grad_node->name() << " addr: " << new_grad_node.get(); } else { - VLOG(3) << "Recovered TensorWrapper with Empty GradNode"; + VLOG(7) << "Recovered TensorWrapper with Empty GradNode"; } auto* intermediate_autograd_meta = EagerUtils::nullable_autograd_meta(intermidiate_tensor_); @@ -129,7 +175,7 @@ private: void check_inplace_version() { if (no_need_buffer_) { - VLOG(6) << "There's no need to check inplace_version because " + VLOG(7) << "There's no need to check inplace_version because " "no_need_buffer_ is true."; return; } @@ -154,10 +200,10 @@
intermidiate_tensor_.name(), tensor_version, wrapper_version_snapshot)); - VLOG(6) << " The wrapper_version_snapshot of Tensor '" + VLOG(7) << " The wrapper_version_snapshot of Tensor '" << intermidiate_tensor_.name() << "' is [ " << wrapper_version_snapshot << " ]"; - VLOG(6) << " The tensor_version of Tensor '" + VLOG(7) << " The tensor_version of Tensor '" << intermidiate_tensor_.name() << "' is [ " << tensor_version << " ]"; } @@ -168,5 +214,7 @@ paddle::experimental::Tensor intermidiate_tensor_; std::weak_ptr<GradNodeBase> weak_grad_node_; uint32_t inplace_version_snapshot_ = 0; + PyObject* packed_value_{nullptr}; + std::shared_ptr<UnPackHookBase> unpack_hook_; }; } // namespace egr diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 144ceab1e4983..515def46b6413 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -77,7 +77,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X, size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs; for (size_t i = 0; i < max_num_runs; i++) { - input_tensor0 = matmul_dygraph_function(input_tensor0, Y, false, false); + input_tensor0 = matmul_ad_func(input_tensor0, Y, false, false); } std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0}; diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 33da489fd47b1..23ba88c8898c1 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -54,7 +54,7 @@ static void clear_no_grad_edges_with_partial_block( } } -inline void run_program_dygraph_function( +inline void run_program_ad_func( const std::vector<paddle::experimental::Tensor>& x, const std::vector<paddle::experimental::Tensor>& params, std::vector<paddle::experimental::Tensor>& out, // NOLINT diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 33e2c84099e03..777929bbc7536 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -296,7 +296,7 @@ void EagerUtils::HandleViewBetweenInputAndOutput( view_output_dense_tensor->ShareInplaceVersionCounterWith( *input_dense_tensor); - VLOG(3) << "Perform View between Output Tensor(" + VLOG(4) << "Perform View between Output Tensor(" << view_output_tensor->name() << ") and Input Tensor(" << input_tensor.name() << "), share allocation and inplace version."; @@ -409,7 +409,7 @@ std::vector<paddle::experimental::Tensor> EagerUtils::RecoverTensorWrapper( } return ret; } - +// TODO(jiabin): remove all this when we fix all tests that use tmp grad void EagerUtils::CheckAndRetainGrad( const paddle::experimental::Tensor& tensor) { VLOG(6) << "Check RetainGradForTensor: " << tensor.name(); diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index e82d8d03a0096..0f9460febbc5d 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -230,6 +230,7 @@ const std::vector<paddle::experimental::Tensor>& tensors); static void CheckAndRetainGrad( const std::vector<paddle::experimental::Tensor>& tensors); + static std::shared_ptr<GradNodeBase> GetGradAccumulationNode( const paddle::experimental::Tensor& tensor); @@ -246,6 +247,184 @@ class EagerUtils { static void FillZeroForEmptyGradInput( std::vector<paddle::experimental::Tensor>* in_grads, const std::vector<GradSlotMeta>& grad_in_metas); + /** + * Print Input Output (more detail at higher VLOG levels: 4 < 5 < 6) + * **/ + static const std::string TensorStr(const paddle::experimental::Tensor& t) { + std::string tensor_name_str = ""; + if (t.name() == "") { + tensor_name_str = "None"; + } else {
tensor_name_str = t.name(); + } + const char* TENSOR_INFO_TEMPLATE = + "{ Type: [ \"%s\" ], Dtype:[ \"%s\" ], Place:[ \"%s\" ] }"; + std::string tensor_info_str = ""; + if (t.defined()) { + if (t.initialized()) { + tensor_info_str += paddle::string::Sprintf(TENSOR_INFO_TEMPLATE, + t.impl()->type_info().name(), + t.dtype(), + t.place().DebugString()); + } else { + tensor_info_str += paddle::string::Sprintf(TENSOR_INFO_TEMPLATE, + t.impl()->type_info().name(), + "Unknown", + "Unknown"); + } + } else { + tensor_info_str += "Unknown"; + } + if (VLOG_IS_ON(6)) { + const char* TENSOR_PRINT_TEMPLATE = + "{ Name:[ \"%s\" ], Initialized: [ \"%d\" ], Ptr: [ \"%d\" ] " + "TensorInfo: [ \"%s\" ], ADInfo:[ \"%s\" ] }"; + auto* ad_meta = nullable_autograd_meta(t); + if (ad_meta && ad_meta->WeakGrad().lock().get()) { + std::string ad_info_str = ""; + const char* AD_INFO_TEMPLATE = + "{ Grad: [ \"%s\" ], GradNode: [ %s ], StopGradient: [ %d ] }"; + ad_info_str += paddle::string::Sprintf(AD_INFO_TEMPLATE, + TensorStr(ad_meta->Grad()), + GradNodeStr(t), + ad_meta->StopGradient()); + return paddle::string::Sprintf(TENSOR_PRINT_TEMPLATE, + tensor_name_str, + t.initialized(), + t.impl(), + tensor_info_str, + ad_info_str); + } else { + return paddle::string::Sprintf(TENSOR_PRINT_TEMPLATE, + tensor_name_str, + t.initialized(), + t.impl(), + tensor_info_str, + "None"); + } + } else if (VLOG_IS_ON(5)) { + const char* TENSOR_PRINT_TEMPLATE = + "{ Name:[ \"%s\" ], Initialized: [ \"%d\" ], Ptr: [ \"%d\" ] " + "TensorInfo: [ \"%s\" ] }"; + return paddle::string::Sprintf(TENSOR_PRINT_TEMPLATE, + tensor_name_str, + t.initialized(), + t.impl(), + tensor_info_str); + } else if (VLOG_IS_ON(4)) { + const char* TENSOR_PRINT_TEMPLATE = + "{ Name:[ \"%s\" ], Initialized: [ \"%d\" ], Ptr: [ \"%d\" ] }"; + return paddle::string::Sprintf( + TENSOR_PRINT_TEMPLATE, tensor_name_str, t.initialized(), t.impl()); + } else { + return "[ Not specified tensor log level ]"; + } + } + + static const std::string GradNodeStr(const egr::GradNodeBase& node) { + if (VLOG_IS_ON(6)) { + const char* GRAD_NODE_TEMPLATE = + " { BackwardOutMeta: [ %s ], BackwardInMeta: [ %s ] }"; + const char* GRAD_SLOT_META_TEMPLATE = " {SlotSize: [%d]: %s} "; + const char* SLOT_INFO_TEMPLATE = + " {SlotID: [\"%s\"], StopGradients: [ %s ], Edges[ %s ] }"; + auto out_metas = node.OutputMeta(); + auto in_metas = node.InputMeta(); + std::string out_slot_str = ""; + std::string in_slot_str = ""; + const char* EDGE_INFO_TEMPLATE = " { [%d, %d]: [%s, %s] }, "; + std::string slot_str = ""; + for (size_t i = 0; i < out_metas.size(); i++) { + std::string edges_str = ""; + std::string sg_str = ""; + for (const GradSlotMeta& meta : out_metas[i]) { + const egr::Edge& edge = meta.GetEdge(); + if (edge.IsInitialized()) { + edges_str += paddle::string::Sprintf(EDGE_INFO_TEMPLATE, + edge.GetEdgeRankInfo().first, + edge.GetEdgeRankInfo().second, + edge.GetGradNode(), + edge.GetGradNode()->name()); + } else { + edges_str += paddle::string::Sprintf("{ NULL Edge }"); + } + sg_str += meta.IsStopGradient() ? "1, " : "0, "; + } + out_slot_str += + paddle::string::Sprintf(SLOT_INFO_TEMPLATE, i, sg_str, edges_str); + } + std::string out_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, out_metas.size(), out_slot_str); + + for (size_t i = 0; i < in_metas.size(); i++) { + std::string edges_str = ""; + std::string sg_str = ""; + for (const GradSlotMeta& meta : in_metas[i]) { + edges_str += paddle::string::Sprintf("{ NULL Edge }"); + sg_str += meta.IsStopGradient() ? "1, " : "0, "; + } + in_slot_str += + paddle::string::Sprintf(SLOT_INFO_TEMPLATE, i, sg_str, edges_str); + } + std::string in_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, in_metas.size(), in_slot_str); + return paddle::string::Sprintf( + GRAD_NODE_TEMPLATE, out_meta_str, in_meta_str); + } else if (VLOG_IS_ON(5)) { + const char* GRAD_NODE_TEMPLATE = + " { BackwardOutMeta: [ %s ], BackwardInMeta: [ %s ] }"; + const char* GRAD_SLOT_META_TEMPLATE = "SlotSize: [\"%d\"]"; + std::string out_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, node.OutputMeta().size()); + std::string in_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, node.InputMeta().size()); + return paddle::string::Sprintf( + GRAD_NODE_TEMPLATE, out_meta_str, in_meta_str); + } else { + return "[ Not specified grad node log level. ] "; + } + } + + static const std::string GradNodeStr(const paddle::experimental::Tensor& t) { + auto* ad_meta = nullable_autograd_meta(t); + if (ad_meta && ad_meta->GetMutableGradNode().get()) { + return GradNodeStr((*ad_meta->GetMutableGradNode().get())); + } else { + return "None"; + } + } + + static const std::string TensorStr( + const std::vector<paddle::experimental::Tensor>& tensors) { + std::string tensors_str = ""; + for (const auto& tensor : tensors) { + tensors_str += TensorStr(tensor) + ", "; + } + return "[ " + tensors_str + " ]"; + } + + static const std::string TensorStr( + const paddle::optional<paddle::experimental::Tensor>& t) { + if (!t.is_initialized()) { + return "{ UnDefinedTensor }"; + } else { + return TensorStr((*t.get_ptr())); + } + } + + static const std::string TensorStr( + const paddle::optional<std::vector<paddle::experimental::Tensor>>& + tensors) { + std::string tensors_str = ""; + if (!tensors.is_initialized()) { + return "[ UnDefinedTensor List ]"; + } else { + for (const auto& tensor : (*tensors.get_ptr())) { + tensors_str += TensorStr(tensor) + ", "; + } + return "[ " + tensors_str + " ]"; + } + } }; } // namespace egr
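These helpers key off glog verbosity rather than a parameter, so a single call site prints more as GLOG_v rises (4: name/initialized/ptr; 5: plus type/dtype/place; 6: plus grad info). A minimal usage sketch; DebugDump is illustrative, not part of the patch:

    #include "paddle/fluid/eager/utils.h"

    // Dumps one tensor and its grad node at the currently active verbosity.
    void DebugDump(const paddle::experimental::Tensor& t) {
      VLOG(4) << "tensor: " << egr::EagerUtils::TensorStr(t);
      VLOG(6) << "grad node: " << egr::EagerUtils::GradNodeStr(t);
    }
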
"1, " : "0, "; + } + in_slot_str += + paddle::string::Sprintf(SLOT_INFO_TEMPLATE, i, sg_str, edges_str); + } + std::string in_meta_str = + paddle::string::Sprintf(GRAD_SLOT_META_TEMPLATE, in_slot_str); + return paddle::string::Sprintf( + GRAD_NODE_TEMPLATE, out_meta_str, in_meta_str); + } else if (VLOG_IS_ON(5)) { + const char* GRAD_NODE_TEMPLATE = + " { BackwardOutMeta: [ %s ], BackwardInMeta: [ %s ] }"; + const char* GRAD_SLOT_META_TEMPLATE = "SlotSize: [\"%d\"]"; + std::string out_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, node.OutputMeta().size()); + std::string in_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, node.InputMeta().size()); + return paddle::string::Sprintf( + GRAD_NODE_TEMPLATE, out_meta_str, in_meta_str); + } else { + return "[ Not specified grad node log level. ] "; + } + } + + static const std::string GradNodeStr(const paddle::experimental::Tensor& t) { + auto* ad_meta = nullable_autograd_meta(t); + if (ad_meta && !(ad_meta->GetMutableGradNode().get())) { + return GradNodeStr((*ad_meta->GetMutableGradNode().get())); + } else { + return "None"; + } + } + + static const std::string TensorStr( + const std::vector& tensors) { + std::string tensors_str = ""; + for (const auto& tensor : tensors) { + tensors_str += TensorStr(tensor) + ", "; + } + return "[ " + tensors_str + " ]"; + } + + static const std::string TensorStr( + const paddle::optional& t) { + if (!t.is_initialized()) { + return "{ UnDefinedTensor }"; + } else { + return TensorStr((*t.get_ptr())); + } + } + + static const std::string TensorStr( + const paddle::optional>& + tensors) { + std::string tensors_str = ""; + if (!tensors.is_initialized()) { + return "[ UnDefinedTensor List ]"; + } else { + for (const auto& tensor : (*tensors.get_ptr())) { + tensors_str += TensorStr(tensor) + ", "; + } + return "[ " + tensors_str + " ]"; + } + } }; } // namespace egr diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 8c8d702e28f42..c58d1a57ec466 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -511,9 +511,9 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { AddComment(R"DOC( Custom Operator. -According to the Tensor operation function implemented by the user -independently of the framework, it is encapsulated into a framework -operator to adapt to various execution scenarios such as dynamic graph, +According to the Tensor operation function implemented by the user +independently of the framework, it is encapsulated into a framework +operator to adapt to various execution scenarios such as dynamic graph, mode static graph mode, and inference mode. )DOC"); diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 4bf81b46b3456..3c6a89f2939a7 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -171,7 +171,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, out->set_mem_desc(out_mem_desc); out->Resize(in.dims()); - if ((in.mem_desc() != out->mem_desc()) || always_copy) { + // Note(0x45f): Using initialized() to support slice Tensors + // with shapes like [0, 0, 0]. 
+ if (in.initialized() && ((in.mem_desc() != out->mem_desc()) || always_copy)) { void* in_data = GetDataFromTensor(in, in_type); platform::ReorderMKLDNNHandler handler( diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index ab63b489a2eda..fd1c06fc6458e 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/phi/kernels/funcs/eigen/extensions.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index c383342ee3456..22d3ac4333fb6 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index fccb2ee5a7550..06962f7b5e773 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -169,7 +169,12 @@ void DataTranferHelper::RunAndConstructOpFuncNode( // NOTE(winter-wang): in npu device, D2H kernel is asynchronous. need to // explicit synchronization. #ifdef PADDLE_WITH_ASCEND_CL - if (op_type == kMemcpyD2H) { + if (op_type == kMemcpyD2H && platform::is_npu_place(dev_ctx->GetPlace())) { + dev_ctx->Wait(); + } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (op_type == kMemcpyD2H && platform::is_custom_place(dev_ctx->GetPlace())) { dev_ctx->Wait(); } #endif @@ -363,11 +368,12 @@ std::shared_ptr TransferDevice(const std::string& var_name, src_place)); if (IsSupportedHetePlace(dst_place)) { op_type = kMemcpyH2D; - int dst_place_type = platform::is_gpu_place(dst_place) ? 0 - : platform::is_npu_place(dst_place) ? 1 - : platform::is_ipu_place(dst_place) ? 3 - : platform::is_xpu_place(dst_place) ? 2 - : -1; + int dst_place_type = platform::is_gpu_place(dst_place) ? 0 + : platform::is_npu_place(dst_place) ? 1 + : platform::is_ipu_place(dst_place) ? 3 + : platform::is_xpu_place(dst_place) ? 2 + : platform::is_custom_place(dst_place) ? 
6 + : -1; attr_map = {{"dst_place_type", dst_place_type}}; } else if (IsSupportedHetePlace(src_place)) { op_type = kMemcpyD2H; diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index a381943587d03..c1ba3b193f1de 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -600,9 +600,9 @@ void StatisticsEngine::Log(const std::string& filepath) { for (size_t idx = 0; idx < statistics_.size(); ++idx) { const auto& evt_stat = statistics_[idx]; ofs << platform::string_format(std::string(R"JSON( - { - "statistical item" : "%s", - "total time(ns)" : %llu, + { + "statistical item" : "%s", + "total time(ns)" : %llu, "total number of times" : %llu, "normalization time(ns)" : %llu },)JSON"), diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 0a0170110de2a..63d6fcbf823e4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -165,7 +165,14 @@ paddle::framework::FetchList InterpreterCore::Run( ExecuteInstructionList(vec_instruction_); #ifdef PADDLE_WITH_ASCEND_CL - platform::DeviceContextPool::Instance().Get(place_)->Wait(); + if (platform::is_npu_place(place_)) { + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place_)) { + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + } #endif } if (create_local_scope_) { @@ -223,7 +230,14 @@ paddle::framework::FetchList InterpreterCore::Run( ExecuteInstructionList(vec_instruction_); #ifdef PADDLE_WITH_ASCEND_CL - platform::DeviceContextPool::Instance().Get(place_)->Wait(); + if (platform::is_npu_place(place_)) { + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place_)) { + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + } #endif } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 2df8892f5bd8a..4f065f2452e28 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -399,7 +399,8 @@ static bool IsCpuOp(const Instruction& instr) { // is supported heterogeneous place static bool IsSupportedHetePlace(const phi::Place& place) { return platform::is_gpu_place(place) || platform::is_npu_place(place) || - platform::is_xpu_place(place) || platform::is_ipu_place(place); + platform::is_xpu_place(place) || platform::is_ipu_place(place) || + platform::is_custom_place(place); } } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index 760a852baee68..3025f017471c7 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -30,7 +30,8 @@ std::mutex ctx_mtx; } // namespace StreamAnalyzer::StreamAnalyzer(const platform::Place& place) : place_(place) { - if (platform::is_gpu_place(place) || platform::is_npu_place(place)) { + if (platform::is_gpu_place(place) || platform::is_npu_place(place) || + platform::is_custom_place(place)) { std::lock_guard lk(ctx_mtx); if (d2h_ctxs == nullptr) { d2h_ctxs = new std::map< @@ -178,7 +179,8 @@ 
platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( auto* dev_ctx = op_func_node.dev_ctx_; // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is // synchronous. - if (platform::is_gpu_place(place_) || platform::is_npu_place(place_)) { + if (platform::is_gpu_place(place_) || platform::is_npu_place(place_) || + platform::is_custom_place(place_)) { if (op_type == interpreter::kMemcpyD2H) { VLOG(3) << "Get dev_ctx from d2h_context_pool_"; dev_ctx = d2h_ctx_.get().get(); @@ -209,7 +211,7 @@ bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr, return true; // npu d2h kernel is asynchronous. - if (platform::is_npu_place(place_)) { + if (platform::is_npu_place(place_) || platform::is_custom_place(place_)) { return interpreter::IsCpuOp(cur_instr) || interpreter::IsMemcpyH2D(next_instr); } @@ -227,6 +229,8 @@ platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) { return platform::kXPU; } else if (platform::is_npu_place(place_)) { return platform::kNPU; + } else if (platform::is_custom_place(place_)) { + return platform::kCUSTOM_DEVICE; } return platform::kCUDA; } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 4d0d10c7836c1..a725521c42347 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -998,16 +998,25 @@ void OpDesc::Flush() { std::vector> sorted_attrs{attrs_.begin(), attrs_.end()}; + + std::vector> sorted_runtime_attrs{ + runtime_attrs_.begin(), runtime_attrs_.end()}; + std::sort( sorted_attrs.begin(), sorted_attrs.end(), [](std::pair a, std::pair b) { return a.first < b.first; }); + std::sort( + sorted_runtime_attrs.begin(), + sorted_runtime_attrs.end(), + [](std::pair a, + std::pair b) { return a.first < b.first; }); for (auto &attr : sorted_attrs) { set_attr_desc(attr.first, attr.second); } - for (auto &attr : runtime_attrs_) { + for (auto &attr : sorted_runtime_attrs) { set_attr_desc(attr.first, attr.second); } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 1140db207af9d..62bbf77a2df1d 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -210,6 +210,7 @@ PreparedOp PrepareImpl( kernel_signature = *default_kernel_signature; } } + if (has_phi_kernel) { VLOG(6) << kernel_signature; phi_kernel_name = kernel_signature.name; @@ -217,13 +218,10 @@ PreparedOp PrepareImpl( // But the default library_type is Plain, so we need to modify the // library_type here, otherwise it can't work. 
#ifdef PADDLE_WITH_XPU_KP - bool is_kp_support = false; if (paddle::platform::is_xpu_place(expected_kernel_key.place_)) { bool use_xpu_kp_kernel_rt = - FLAGS_run_kp_kernel && - paddle::platform::is_xpu_kp_support_op(op.Type(), - expected_kernel_key) && - (!paddle::platform::is_in_xpu_black_list(op.Type())); + FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op( + op.Type(), expected_kernel_key); bool use_xpu_kp_kernel_debug = paddle::platform::is_in_xpu_kpwhite_list(op.Type()); if (use_xpu_kp_kernel_rt) { @@ -234,7 +232,6 @@ PreparedOp PrepareImpl( } bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); - is_kp_support = is_xpu_kp_support; if (is_xpu_kp_support) { auto expected_kernel_key_library_type = expected_kernel_key.library_type_; @@ -264,9 +261,6 @@ PreparedOp PrepareImpl( if (phi_kernel.IsValid() #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport -#endif -#if defined(PADDLE_WITH_XPU_KP) - && is_kp_support #endif ) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << phi_kernel_name diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index fbc2830aff614..445145dde3954 100755 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2185,6 +2185,8 @@ USE_TRT_CONVERTER(shape) USE_TRT_CONVERTER(fill_constant) USE_TRT_CONVERTER(fused_token_prune) USE_TRT_CONVERTER(layernorm_shift_partition) +USE_TRT_CONVERTER(generic_plugin_creater) +USE_TRT_CONVERTER(custom_plugin_creater) #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) USE_TRT_CONVERTER(sparse_fc) USE_TRT_CONVERTER(sparse_multihead_matmul) diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 7239b506d33f6..d4a4c8c06af75 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -12,10 +12,18 @@ else() SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) endif() +nv_library( + tensorrt_dynamic_shape_infermeta_factory + SRCS dynamic_shape_infermeta.cc + DEPS framework_proto) +nv_library( + tensorrt_plugin_arg_mapping_context + SRCS plugin_arg_mapping_context.cc + DEPS framework_proto) nv_library( tensorrt_op_teller SRCS op_teller.cc - DEPS framework_proto device_context) + DEPS framework_proto device_context tensorrt_dynamic_shape_infermeta_factory) nv_test( test_tensorrt SRCS test_tensorrt.cc @@ -24,6 +32,10 @@ nv_test( test_tensorrt_engine SRCS test_engine.cc test_dynamic_engine.cc DEPS dynload_cuda tensorrt_engine tensorrt_plugin) +nv_test( + test_arg_mapping_context + SRCS test_arg_mapping_context.cc + DEPS framework_proto tensorrt_plugin_arg_mapping_context) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ce95363b72d0b..60a5d0f282525 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -76,7 +76,8 @@ list( shape_op.cc fill_constant_op.cc fused_token_prune_op.cc - layernorm_shift_partition_op.cc) + layernorm_shift_partition_op.cc + generic_and_custom_plugin_creater.cc) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc) @@ -85,7 +86,12 @@ 
endif() nv_library( tensorrt_converter SRCS ${CONVERT_FILES} - DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto + DEPS tensorrt_engine + tensorrt_plugin + operator + scope + framework_proto + tensorrt_op_teller op_registry) nv_test( @@ -94,6 +100,11 @@ nv_test( DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) +nv_test( + test_custom_plugin_creater + SRCS test_custom_plugin_creater.cc + DEPS paddle_framework tensorrt_converter op_meta_info custom_operator) + if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will # be build only in CI, so suppose the generator in Windows is Ninja. diff --git a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc new file mode 100644 index 0000000000000..e1ce9ceb0208b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc @@ -0,0 +1,248 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/plugin/generic_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +/* + * Custom-op and generic plugin converters from fluid to TensorRT.
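+ * + * A custom op is matched to its plugin creator purely by name: for an op + * type my_op (illustrative), the creator is looked up under the key + * "my_op_paddle_trt_plugin", or "my_op_paddle_trt_dynamic_plugin" when the + * engine runs with dynamic shape.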
+ */ +class CustomPluginCreater : public OpConverter { + public: + void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + VLOG(3) << "convert " << op_desc.Type() << " op to custom plugin layer"; + + std::string plugin_name; + + if (engine_->with_dynamic_shape()) { + plugin_name = op_desc.Type() + "_paddle_trt_dynamic_plugin"; + } else { + plugin_name = op_desc.Type() + "_paddle_trt_plugin"; + } + + nvinfer1::ILayer *layer = nullptr; + std::vector<nvinfer1::ITensor *> inputs; + + auto &op_meta_info_map = OpMetaInfoMap::Instance(); + const auto &meta_info_map = op_meta_info_map.GetMap(); + auto &op_info = meta_info_map.at(op_desc.Type()).front(); + + // set inputs + auto &op_input_names = framework::OpMetaInfoHelper::GetInputs(op_info); + for (auto &param_name : op_input_names) { + for (auto &arg_name : op_desc.Input(param_name)) { + framework::Variable *X_v = nullptr; + X_v = scope.FindVar(arg_name); + // If this weight is not shared between ops, it needs to be converted + // to an ITensor + if (X_v && !engine_->GetITensorMap()->count(arg_name)) { + ConvertWeight2ITensor(scope, arg_name); + } + inputs.push_back(engine_->GetITensor(arg_name)); + } + } + auto creator = + GetPluginRegistry()->getPluginCreator(plugin_name.c_str(), "1"); + CHECK(creator); + + // set attrs + std::vector<nvinfer1::PluginField> plugindatas; + auto &op_attrs_names = framework::OpMetaInfoHelper::GetAttrs(op_info); + auto &attrs = op_desc.GetAttrMap(); + + std::list<int> int_attrs; + std::list<float> float_attrs; + std::list<bool> bool_attrs; + std::list<std::string> string_attrs; + std::list<std::vector<int>> ints_attrs; + std::list<std::vector<float>> floats_attrs; + + for (auto &attr_name : op_attrs_names) { + nvinfer1::PluginField plugindata; + plugindata.name = attr_name.c_str(); + if (op_desc.GetAttrType(attr_name) == framework::proto::AttrType::INT) { + int_attrs.push_back(PADDLE_GET_CONST(int, attrs.at(attr_name))); + plugindata.data = &int_attrs.back(); + plugindata.type = nvinfer1::PluginFieldType::kINT32; + plugindata.length = 1; + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::FLOAT) { + float_attrs.push_back(PADDLE_GET_CONST(float, attrs.at(attr_name))); + plugindata.data = &float_attrs.back(); + plugindata.type = nvinfer1::PluginFieldType::kFLOAT32; + plugindata.length = 1; + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::BOOLEAN) { + int_attrs.push_back(PADDLE_GET_CONST(bool, attrs.at(attr_name))); + plugindata.data = &int_attrs.back(); + plugindata.type = nvinfer1::PluginFieldType::kINT32; + plugindata.length = 1; + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::STRING) { + string_attrs.push_back( + PADDLE_GET_CONST(std::string, attrs.at(attr_name))); + plugindata.data = string_attrs.back().data(); + plugindata.type = nvinfer1::PluginFieldType::kCHAR; + plugindata.length = + string_attrs.back().size() + 1; // string ends with '\0' + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::INTS) { + ints_attrs.push_back( + PADDLE_GET_CONST(std::vector<int>, attrs.at(attr_name))); + plugindata.data = ints_attrs.back().data(); + plugindata.type = nvinfer1::PluginFieldType::kINT32; + plugindata.length = ints_attrs.back().size(); + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::FLOATS) { + floats_attrs.push_back( + PADDLE_GET_CONST(std::vector<float>, attrs.at(attr_name))); + plugindata.data = floats_attrs.back().data(); + plugindata.type = nvinfer1::PluginFieldType::kFLOAT32; + plugindata.length =
floats_attrs.back().size(); + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::BOOLEANS) { + auto bools_attr = + PADDLE_GET_CONST(std::vector<bool>, attrs.at(attr_name)); + std::vector<int> convert_to_ints_attr; + for (bool i : bools_attr) convert_to_ints_attr.push_back(i); + ints_attrs.push_back(convert_to_ints_attr); + plugindata.data = ints_attrs.back().data(); + plugindata.type = nvinfer1::PluginFieldType::kINT32; + plugindata.length = ints_attrs.back().size(); + } else { + CHECK(false) << "UNKNOWN PluginFieldType."; + } + plugindatas.push_back(plugindata); + } + + nvinfer1::PluginFieldCollection plugin_fc{(int32_t)plugindatas.size(), + plugindatas.data()}; + + auto *plugin = creator->createPlugin(op_desc.Type().c_str(), &plugin_fc); + CHECK(plugin); + + if (engine_->with_dynamic_shape()) { + layer = + engine_->AddDynamicPlugin(inputs.data(), + inputs.size(), + (plugin::DynamicPluginTensorRT *)plugin); + } else { + layer = engine_->AddPlugin( + inputs.data(), inputs.size(), (plugin::PluginTensorRT *)plugin); + } + + CHECK(layer); + + // set outputs + auto &op_output_names = framework::OpMetaInfoHelper::GetOutputs(op_info); + std::vector<std::string> output_names; + for (auto &param_name : op_output_names) { + for (auto &arg_name : op_desc.Output(param_name)) + output_names.push_back(arg_name); + } + + RreplenishLayerAndOutput(layer, op_desc.Type(), output_names, test_mode); + } +}; + +class GenericPluginCreater : public OpConverter { + public: + void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + CHECK(block_); + const framework::BlockDesc block_desc( + nullptr, const_cast<framework::proto::BlockDesc *>(block_)); + + nvinfer1::ILayer *layer = nullptr; + std::vector<nvinfer1::ITensor *> inputs; + + phi::KernelSignature phi_kernel_signature; + if (phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_desc.Type())) { + const phi::ArgumentMappingFn *argument_mapping_func = + phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_desc.Type()); + PluginArgumentMappingContext argument_mapping_context(&op_desc); + phi_kernel_signature = (*argument_mapping_func)(argument_mapping_context); + } else { + phi_kernel_signature = + phi::DefaultKernelSignatureMap::Instance().Get(op_desc.Type()); + } + + plugin::GenericPlugin::InputOutPutVarInfo in_out_info; + + for (auto &param_name : phi_kernel_signature.input_names) { + for (auto &arg_name : op_desc.Input(param_name)) { + framework::Variable *X_v = nullptr; + X_v = scope.FindVar(arg_name); + // If this weight is not shared between ops, it needs to be converted + // to an ITensor + if (X_v && !engine_->GetITensorMap()->count(arg_name)) { + ConvertWeight2ITensor(scope, arg_name); + } + + inputs.push_back(engine_->GetITensor(arg_name)); + auto *var = block_desc.FindVar(arg_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "There is no variable called %s in block.", arg_name.c_str())); + PADDLE_ENFORCE_EQ( + var->GetType(), + FluidDT::VarType_Type_LOD_TENSOR, + platform::errors::InvalidArgument("TensorRT engine only takes " + "LoDTensor as input")); + in_out_info.inputs_data_type.push_back(var->GetDataType()); + } + } + + std::vector<std::string> output_names; + for (auto &param_name : phi_kernel_signature.output_names) { + for (auto &arg_name : op_desc.Output(param_name)) { + output_names.push_back(arg_name); + auto *var = block_desc.FindVar(arg_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "There is no variable called %s in block.", arg_name.c_str())); + PADDLE_ENFORCE_EQ( + var->GetType(), + FluidDT::VarType_Type_LOD_TENSOR, + platform::errors::InvalidArgument("TensorRT engine only takes " + "LoDTensor as input")); + in_out_info.outputs_data_type.push_back(var->GetDataType()); + } + } + plugin::GenericPlugin *plugin = new plugin::GenericPlugin(op, in_out_info); + layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin); + + RreplenishLayerAndOutput(layer, op_desc.Type(), output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(custom_plugin_creater, CustomPluginCreater); +REGISTER_TRT_OP_CONVERTER(generic_plugin_creater, GenericPluginCreater);
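A design note on the attribute marshalling above: the attr values are parked in std::list containers because std::list never relocates existing elements on push_back, so the raw pointers written into each PluginField stay valid until createPlugin consumes the collection. A trimmed, self-contained sketch of the same pattern (the field name "axis" is hypothetical):

    #include <NvInfer.h>
    #include <list>

    // Marshal one int attribute into a PluginField; storage must outlive
    // the createPlugin() call that reads the resulting field collection.
    nvinfer1::PluginField MakeIntField(std::list<int>* storage,
                                       const char* name,
                                       int value) {
      storage->push_back(value);  // std::list keeps back()'s address stable
      nvinfer1::PluginField field;
      field.name = name;
      field.data = &storage->back();
      field.type = nvinfer1::PluginFieldType::kINT32;
      field.length = 1;
      return field;
    }

Usage: MakeIntField(&int_storage, "axis", 2), with int_storage declared in the enclosing scope, mirrors what the converter does for every INT attr.
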
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index cdd6345c48441..095457dbfbbba 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { @@ -49,111 +50,135 @@ class OpConverter { const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine, - bool test_mode = false) { + bool test_mode = false, + const framework::proto::BlockDesc* block = nullptr) { framework::OpDesc op_desc(op, nullptr); OpConverter* it{nullptr}; - if (op_desc.Type() == "mul") { - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), - 1UL, - platform::errors::InvalidArgument( - "The input op mul's Input(\"Y\")." - "size() should equal to 1, but reveceid " - "Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); - std::string Y = op_desc.Input("Y")[0]; - if (parameters.count(Y)) { - it = Registry::Global().Lookup("fc"); - } - } - if (op_desc.Type().find("elementwise") != std::string::npos) { - static std::unordered_set add_tensor_op_set{ - "add", "mul", "sub", "div", "max", "min", "pow"}; - static std::unordered_set add_weight_op_set{ - "add", "mul", "sub", "div", "pow"}; - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), - 1UL, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\")."
- "size() should equal to 1, but reveceid " - "Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); - int op_type_len = op_desc.Type().size(); - std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); - std::string Y = op_desc.Input("Y")[0]; - if (parameters.count(Y)) { - PADDLE_ENFORCE_GT( - add_weight_op_set.count(op_type), - 0, - platform::errors::Unimplemented("Unsupported elementwise type %s", - op_type.c_str())); - it = Registry::Global().Lookup("elementwise_" + op_type + - "_weight"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } else { - PADDLE_ENFORCE_GT( - add_tensor_op_set.count(op_type), - 0, - platform::errors::Unimplemented("Unsupported elementwise type %s", - op_type.c_str())); - it = Registry::Global().Lookup("elementwise_" + op_type + - "_tensor"); - } - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } + auto op_converter_type_map = OpTeller::Global().GetOpConverterTypeMap(); + switch (op_converter_type_map.at(op_desc.Type())) { + case OpConverterType::Default: + if (op_desc.Type() == "mul") { + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), + 1UL, + platform::errors::InvalidArgument( + "The input op mul's Input(\"Y\")." + "size() should equal to 1, but reveceid " + "Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + it = Registry::Global().Lookup("fc"); + } + } + if (op_desc.Type().find("elementwise") != std::string::npos) { + static std::unordered_set add_tensor_op_set{ + "add", "mul", "sub", "div", "max", "min", "pow"}; + static std::unordered_set add_weight_op_set{ + "add", "mul", "sub", "div", "pow"}; + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), + 1UL, + platform::errors::InvalidArgument( + "The input op's Input(\"Y\")." 
+ "size() should equal to 1, but reveceid " + "Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); + int op_type_len = op_desc.Type().size(); + std::string op_type = + op_desc.Type().substr(op_type_len - 3, op_type_len); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + PADDLE_ENFORCE_GT( + add_weight_op_set.count(op_type), + 0, + platform::errors::Unimplemented( + "Unsupported elementwise type %s", op_type.c_str())); + it = Registry::Global().Lookup("elementwise_" + + op_type + "_weight"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented( + "no OpConverter for optype [%s]", op_desc.Type())); + } else { + PADDLE_ENFORCE_GT( + add_tensor_op_set.count(op_type), + 0, + platform::errors::Unimplemented( + "Unsupported elementwise type %s", op_type.c_str())); + it = Registry::Global().Lookup("elementwise_" + + op_type + "_tensor"); + } + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } - if (op_desc.Type() == "depthwise_conv2d") { - it = Registry::Global().Lookup("conv2d"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - if (op_desc.Type() == "depthwise_conv2d_transpose") { - it = Registry::Global().Lookup("conv2d_transpose"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - if (op_desc.Type() == "transpose2") { - it = Registry::Global().Lookup("transpose"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - if (op_desc.Type() == "flatten2") { - it = Registry::Global().Lookup("flatten"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - // reshape2 == reshape - if (op_desc.Type() == "reshape2") { - it = Registry::Global().Lookup("reshape"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - if (!it) { - it = Registry::Global().Lookup(op_desc.Type()); + if (op_desc.Type() == "depthwise_conv2d") { + it = Registry::Global().Lookup("conv2d"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + if (op_desc.Type() == "depthwise_conv2d_transpose") { + it = Registry::Global().Lookup("conv2d_transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + if (op_desc.Type() == "transpose2") { + it = Registry::Global().Lookup("transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + if (op_desc.Type() == "flatten2") { + it = Registry::Global().Lookup("flatten"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + // reshape2 == reshape + if (op_desc.Type() == "reshape2") { + it = Registry::Global().Lookup("reshape"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + if (!it) { + it = Registry::Global().Lookup(op_desc.Type()); + } + break; + + case OpConverterType::GenericPluginCreater: + LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() + << ", now use generic_plugin_creater!"; + it = 
Registry::Global().Lookup("generic_plugin_creater"); + break; + + case OpConverterType::CustomPluginCreater: + LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() + << ", now use custom_plugin_creater!"; + it = Registry::Global().Lookup("custom_plugin_creater"); + break; + + default: + CHECK(false) << "no OpConverter for optype " << op_desc.Type(); } + PADDLE_ENFORCE_NOT_NULL( it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); it->SetEngine(engine); + it->SetBlockDesc(block); (*it)(op, scope, test_mode); size_t output_num = op_desc.OutputNames().size(); @@ -257,7 +282,7 @@ class OpConverter { } for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); - ConvertOp(op, parameters, scope, engine); + ConvertOp(op, parameters, scope, engine, false, &block); } for (int i = 0; i < engine->network()->getNbLayers(); i++) { auto layer = engine->network()->getLayer(i); @@ -620,10 +645,16 @@ class OpConverter { } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } + void SetBlockDesc(const framework::proto::BlockDesc* block) { + block_ = block; + } + virtual ~OpConverter() {} // TensorRT engine TensorRTEngine* engine_{nullptr}; + // BlockDesc + const framework::proto::BlockDesc* block_{nullptr}; protected: bool test_mode_; diff --git a/paddle/fluid/inference/tensorrt/convert/test_custom_op_plugin.h b/paddle/fluid/inference/tensorrt/convert/test_custom_op_plugin.h new file mode 100644 index 0000000000000..adb41528bae00 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_custom_op_plugin.h @@ -0,0 +1,356 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class custom_op_plugin : public nvinfer1::IPluginV2 { + public: + explicit custom_op_plugin(float float_attr) { float_attr_ = float_attr; } + + custom_op_plugin(const void* buffer, size_t length) { + DeserializeValue(&buffer, &length, &float_attr_); + } + + size_t getSerializationSize() const noexcept override { + return SerializedSize(float_attr_); + } + + void serialize(void* buffer) const noexcept override { + SerializeValue(&buffer, float_attr_); + } + + nvinfer1::IPluginV2* clone() const noexcept override { + return new custom_op_plugin(float_attr_); + } + + ~custom_op_plugin() override = default; + + const char* getPluginType() const noexcept override { + return "custom_op_paddle_trt_plugin"; + } + + const char* getPluginVersion() const noexcept override { return "1"; } + + int getNbOutputs() const noexcept override { return 1; } + + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims* inputs, + int nbInputDims) noexcept override { + return inputs[0]; + } + + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const noexcept override { + return true; + } + + void configureWithFormat(nvinfer1::Dims const* inputDims, + int32_t nbInputs, + nvinfer1::Dims const* outputDims, + int32_t nbOutputs, + nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int32_t maxBatchSize) noexcept override {} + + int initialize() noexcept override { return 0; } + + void terminate() noexcept override {} + + size_t getWorkspaceSize(int maxBatchSize) const noexcept override { + return 0; + } + +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batch_size, + const void* const* inputs, + void** outputs, +#else + int enqueue(int batch_size, + const void* const* inputs, + void* const* outputs, +#endif + void* workspace, + cudaStream_t stream) noexcept override { + return 0; + } + + void destroy() noexcept override { delete this; } + + void setPluginNamespace(const char* libNamespace) noexcept override { + namespace_ = libNamespace; + } + + const char* getPluginNamespace() const noexcept override { + return namespace_.c_str(); + } + + private: + float float_attr_; + std::string namespace_; +}; + +class custom_op_plugin_creator : public nvinfer1::IPluginCreator { + public: + custom_op_plugin_creator() {} + + ~custom_op_plugin_creator() override = default; + + const char* getPluginName() const noexcept override { + return "custom_op_paddle_trt_plugin"; + } + + const char* getPluginVersion() const noexcept override { return "1"; } + + void setPluginNamespace(const char* pluginNamespace) noexcept override { + plugin_namespace_ = pluginNamespace; + } + + const char* getPluginNamespace() const noexcept override { + return plugin_namespace_.c_str(); + } + + const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { + return nullptr; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) noexcept override { + CHECK_EQ(fc->nbFields, 7); + // float_attr + auto attr_field = (fc->fields)[0]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kFLOAT32); + CHECK_EQ(attr_field.length, 1); + float float_value = (reinterpret_cast(attr_field.data))[0]; + CHECK_EQ(float_value, 1.0); + + // int_attr + attr_field = (fc->fields)[1]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kINT32); + 
CHECK_EQ(attr_field.length, 1); + int int_value = (reinterpret_cast(attr_field.data))[0]; + CHECK_EQ(int_value, 1); + + // bool_attr + attr_field = (fc->fields)[2]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kINT32); + CHECK_EQ(attr_field.length, 1); + int bool_value = (reinterpret_cast(attr_field.data))[0]; + CHECK_EQ(bool_value, 1); + + // string_attr + attr_field = (fc->fields)[3]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kCHAR); + std::string expect_string_attr = "test_string_attr"; + CHECK_EQ((size_t)attr_field.length, expect_string_attr.size() + 1); + const char* receive_string_attr = + reinterpret_cast(attr_field.data); + CHECK(expect_string_attr == std::string(receive_string_attr)); + + // ints_attr + attr_field = (fc->fields)[4]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kINT32); + CHECK_EQ(attr_field.length, 3); + const int* ints_value = reinterpret_cast(attr_field.data); + CHECK_EQ(ints_value[0], 1); + CHECK_EQ(ints_value[1], 2); + CHECK_EQ(ints_value[2], 3); + + // floats_attr + attr_field = (fc->fields)[5]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kFLOAT32); + CHECK_EQ(attr_field.length, 3); + const float* floats_value = reinterpret_cast(attr_field.data); + CHECK_EQ(floats_value[0], 1.0); + CHECK_EQ(floats_value[1], 2.0); + CHECK_EQ(floats_value[2], 3.0); + + // bools_attr + attr_field = (fc->fields)[6]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kINT32); + CHECK_EQ(attr_field.length, 3); + ints_value = reinterpret_cast(attr_field.data); + CHECK_EQ(ints_value[0], true); + CHECK_EQ(ints_value[1], false); + CHECK_EQ(ints_value[2], true); + + return new custom_op_plugin(float_value); + } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) noexcept override { + return new custom_op_plugin(serialData, serialLength); + } + + private: + std::string plugin_namespace_; +}; + +class custom_op_dynamic_plugin : public nvinfer1::IPluginV2DynamicExt { + public: + explicit custom_op_dynamic_plugin(float float_attr) + : float_attr_(float_attr) {} + + custom_op_dynamic_plugin(const void* buffer, size_t length) { + DeserializeValue(&buffer, &length, &float_attr_); + } + + ~custom_op_dynamic_plugin() override = default; + + const char* getPluginType() const noexcept override { + return "custom_op_paddle_trt_dynamic_plugin"; + } + + const char* getPluginVersion() const noexcept override { return "1"; } + + int getNbOutputs() const noexcept override { return 1; } + + int initialize() noexcept override { return 0; } + + void terminate() noexcept override {} + + size_t getSerializationSize() const noexcept override { + return SerializedSize(float_attr_); + } + + void serialize(void* buffer) const noexcept override { + SerializeValue(&buffer, float_attr_); + } + + void destroy() noexcept override { delete this; } + + void setPluginNamespace(const char* libNamespace) noexcept override { + namespace_ = libNamespace; + } + + const char* getPluginNamespace() const noexcept override { + return namespace_.c_str(); + } + + /*IPluginV2Ext method*/ + nvinfer1::DataType getOutputDataType( + int32_t index, + nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const noexcept override { + return inputTypes[index]; + } + + /*IPluginV2DynamicExt method*/ + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override { + return new custom_op_dynamic_plugin(float_attr_); + }; + + nvinfer1::DimsExprs getOutputDimensions( + int32_t outputIndex, + const nvinfer1::DimsExprs* inputs, + int32_t 
nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept override { + return inputs[0]; + } + + bool supportsFormatCombination(int32_t pos, + const nvinfer1::PluginTensorDesc* inOut, + int32_t nbInputs, + int32_t nbOutputs) noexcept override { + return true; + } + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int32_t nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int32_t nbOutputs) noexcept override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int32_t nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int32_t nbOutputs) const noexcept override { + return 0; + } + + int32_t enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) noexcept override { + return 0; + } + + private: + float float_attr_ = 0; + std::string namespace_; +}; + +class custom_op_dynamic_plugin_creator : public nvinfer1::IPluginCreator { + public: + custom_op_dynamic_plugin_creator() {} + + ~custom_op_dynamic_plugin_creator() override = default; + + const char* getPluginName() const noexcept override { + return "custom_op_paddle_trt_dynamic_plugin"; + } + + const char* getPluginVersion() const noexcept override { return "1"; } + + void setPluginNamespace(char const* pluginNamespace) noexcept override { + plugin_namespace_ = pluginNamespace; + } + + const char* getPluginNamespace() const noexcept override { + return plugin_namespace_.c_str(); + } + + const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { + return nullptr; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) noexcept override { + return new custom_op_dynamic_plugin(1.0); + } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) noexcept override { + return new custom_op_dynamic_plugin(serialData, serialLength); + } + + private: + std::string plugin_namespace_; +}; + +REGISTER_TRT_PLUGIN_V2(custom_op_plugin_creator); +REGISTER_TRT_PLUGIN_V2(custom_op_dynamic_plugin_creator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc new file mode 100644 index 0000000000000..2a3ead9c8e684 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include // NOLINT + +#include "paddle/extension.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/test_custom_op_plugin.h" + +PD_BUILD_OP(custom_op) + .Inputs({"Input"}) + .Outputs({"Output"}) + .Attrs({ + "float_attr", + "int_attr", + "bool_attr", + "string_attr", + "ints_attr", + "floats_attr", + "bools_attr", + }); + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(CustomPluginCreater, StaticShapePlugin) { + framework::ProgramDesc prog; + auto *block = prog.MutableBlock(0); + auto *op = block->AppendOp(); + framework::proto::OpDesc *op_desc = op->Proto(); + + op_desc->set_type("custom_op"); + auto *input_var = op_desc->add_inputs(); + input_var->set_parameter("Input"); + *input_var->add_arguments() = "X"; + + auto *output_var = op_desc->add_outputs(); + output_var->set_parameter("Output"); + *output_var->add_arguments() = "Out"; + + auto *attr = op_desc->add_attrs(); + attr->set_name("float_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(1.0); + + attr = op_desc->add_attrs(); + attr->set_name("int_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(1); + + attr = op_desc->add_attrs(); + attr->set_name("bool_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + attr = op_desc->add_attrs(); + attr->set_name("string_attr"); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s("test_string_attr"); + + attr = op_desc->add_attrs(); + attr->set_name("ints_attr"); + attr->set_type(paddle::framework::proto::AttrType::INTS); + attr->add_ints(1); + attr->add_ints(2); + attr->add_ints(3); + + attr = op_desc->add_attrs(); + attr->set_name("floats_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOATS); + attr->add_floats(1.0); + attr->add_floats(2.0); + attr->add_floats(3.0); + + attr = op_desc->add_attrs(); + attr->set_name("bools_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEANS); + attr->add_bools(true); + attr->add_bools(false); + attr->add_bools(true); + + // init trt engine + std::unique_ptr engine_; + engine_.reset(new TensorRTEngine(5, 1 << 15)); + engine_->InitNetwork(); + + engine_->DeclareInput( + "X", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3(2, 5, 5)); + + framework::Scope scope; + + tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); + + auto &custom_plugin_tell = OpTeller::Global().GetCustomPluginTeller(); + + framework::OpDesc custom_op(*op_desc, nullptr); + CHECK_EQ((*custom_plugin_tell)(custom_op, false, false), true); + + OpTeller::Global().SetOpConverterType("custom_op", + OpConverterType::CustomPluginCreater); + + OpConverter converter; + converter.ConvertBlock( + *block->Proto(), {}, scope, engine_.get() /*TensorRTEngine*/); +} + +TEST(CustomPluginCreater, DynamicShapePlugin) { + framework::ProgramDesc prog; + auto *block = prog.MutableBlock(0); + auto *op = block->AppendOp(); + framework::proto::OpDesc *op_desc = op->Proto(); + + op_desc->set_type("custom_op"); + auto *input_var = op_desc->add_inputs(); + input_var->set_parameter("Input"); + *input_var->add_arguments() = "X"; + + auto *output_var = op_desc->add_outputs(); + output_var->set_parameter("Output"); + *output_var->add_arguments() = "Out"; + + auto *attr = op_desc->add_attrs(); + attr->set_name("float_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + + attr = 
op_desc->add_attrs(); + attr->set_name("int_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + + attr = op_desc->add_attrs(); + attr->set_name("bool_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + + attr = op_desc->add_attrs(); + attr->set_name("string_attr"); + attr->set_type(paddle::framework::proto::AttrType::STRING); + + attr = op_desc->add_attrs(); + attr->set_name("ints_attr"); + attr->set_type(paddle::framework::proto::AttrType::INTS); + + attr = op_desc->add_attrs(); + attr->set_name("floats_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOATS); + + attr = op_desc->add_attrs(); + attr->set_name("bools_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEANS); + + // init trt engine + std::unique_ptr<TensorRTEngine> engine_; + + std::map<std::string, std::vector<int>> min_input_shape = { + {"x", {1, 2, 5, 5}}}; + + std::map<std::string, std::vector<int>> max_input_shape = { + {"x", {1, 2, 5, 5}}}; + + std::map<std::string, std::vector<int>> optim_input_shape = { + {"x", {1, 2, 5, 5}}}; + + engine_.reset(new TensorRTEngine(5, + 1 << 15, + AnalysisConfig::Precision::kFloat32, + nullptr, + 0, + min_input_shape, + max_input_shape, + optim_input_shape)); + engine_->InitNetwork(); + + LOG(INFO) << "with_dynamic_shape " << engine_->with_dynamic_shape(); + engine_->DeclareInput( + "X", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4(-1, 2, 5, 5)); + + framework::Scope scope; + + tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); + + auto &custom_plugin_tell = OpTeller::Global().GetCustomPluginTeller(); + + framework::OpDesc custom_op(*op_desc, nullptr); + CHECK_EQ((*custom_plugin_tell)(custom_op, false, true), true); + + OpTeller::Global().SetOpConverterType("custom_op", + OpConverterType::CustomPluginCreater); + + OpConverter converter; + converter.ConvertBlock( + *block->Proto(), {}, scope, engine_.get() /*TensorRTEngine*/); +} +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_TRT_CONVERTER(custom_plugin_creater) diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 5e748aad2375c..795f62a3e1e6a 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -57,6 +57,7 @@ TEST(OpConverter, ConvertBlock) { x_tensor->Resize(phi::make_ddim(dim_vec)); x_tensor->mutable_data<float>(platform::CUDAPlace(0)); + OpTeller::Global().SetOpConverterType("conv2d", OpConverterType::Default); OpConverter converter; converter.ConvertBlock( *block->Proto(), {"conv2d-Y"}, scope, engine_.get() /*TensorRTEngine*/); diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc new file mode 100644 index 0000000000000..1d75f0a7fbf49 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +nvinfer1::DimsExprs GatherNdInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dims = inputs[0]; + const int x_dims_size = inputs[0].nbDims; + const nvinfer1::DimsExprs index_dims = inputs[1]; + const int index_dims_size = inputs[1].nbDims; + + std::vector result_dims; + // The result dims is + // Index.shape[:-1] + X.shape[Index.shape[-1]:] + for (int i = 0; i < index_dims_size - 1; ++i) { + result_dims.emplace_back(index_dims.d[i]); + } + + if (index_dims.d[index_dims_size - 1]->isConstant()) { + for (int i = index_dims.d[index_dims_size - 1]->getConstantValue(); + i < x_dims_size; + ++i) { + result_dims.emplace_back(x_dims.d[i]); + } + } + + nvinfer1::DimsExprs output; + output.nbDims = result_dims.size(); + for (int i = 0; i < output.nbDims; i++) { + output.d[i] = result_dims[i]; + } + return output; +} +PD_REGISTER_DYNAMIC_INFER_META_FN(gather_nd, GatherNdInferMeta); +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h new file mode 100644 index 0000000000000..0196d81754fdd --- /dev/null +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <string> +#include <utility> + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/macros.h" +#include "paddle/utils/flat_hash_map.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using DynamicMetaFn = + nvinfer1::DimsExprs (*)(int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc); + +class DynamicMetaFnFactory { + public: + static DynamicMetaFnFactory& Instance() { + static DynamicMetaFnFactory g_meta_fn_map; + return g_meta_fn_map; + } + + bool Contains(const std::string& op_name) const { + return meta_fn_map_.count(op_name) > 0; + } + + void Insert(std::string op_name, DynamicMetaFn infer_meta_fn) { + PADDLE_ENFORCE_NE( + Contains(op_name), + true, + phi::errors::AlreadyExists( + "`%s` op's DynamicInferMetaFn has been registered.", op_name)); + meta_fn_map_.insert({std::move(op_name), std::move(infer_meta_fn)}); + } + + const DynamicMetaFn& Get(const std::string& op_name) const { + auto it = meta_fn_map_.find(op_name); + PADDLE_ENFORCE_NE( + it, + meta_fn_map_.end(), + phi::errors::NotFound( + "`%s` op's DynamicInferMetaFn has not been registered.", op_name)); + return it->second; + } + + private: + DynamicMetaFnFactory() = default; + + paddle::flat_hash_map<std::string, DynamicMetaFn> meta_fn_map_; + + DISABLE_COPY_AND_ASSIGN(DynamicMetaFnFactory); +}; + +struct DynamicMetaFnRegistrar { + DynamicMetaFnRegistrar(const char* op_name, DynamicMetaFn infer_meta_fn) { + DynamicMetaFnFactory::Instance().Insert(op_name, std::move(infer_meta_fn)); + } + + static void Touch() {} +}; + +#define PD_REGISTER_DYNAMIC_INFER_META_FN(op_name, dynamic_infer_meta_fn) \ + static paddle::inference::tensorrt::DynamicMetaFnRegistrar \ + registrar_dynamic_infer_meta_fn_for_##op_name(#op_name, \ + dynamic_infer_meta_fn); \ + int TouchDynamicMetaFnRegistrar_##op_name() { \ + registrar_dynamic_infer_meta_fn_for_##op_name.Touch(); \ + return 0; \ + } + +#define USE_TRT_DYNAMIC_INFER_META_FN(op_name) \ + extern int TouchDynamicMetaFnRegistrar_##op_name(); \ + static int use_op_dynamic_infer_meta##op_name UNUSED = \ + TouchDynamicMetaFnRegistrar_##op_name(); + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h new file mode 100644 index 0000000000000..f31040772c93d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#pragma once + +#include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +USE_TRT_DYNAMIC_INFER_META_FN(gather_nd); +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 54742df384ba4..55457aa5827e4 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -18,6 +18,11 @@ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_factory.h" namespace paddle { namespace framework { @@ -60,246 +65,16 @@ struct SimpleOpTypeSetTeller : public Teller { #endif } - bool operator()(const std::string& op_type, - const framework::OpDesc& desc, - bool use_no_calib_int8) override { - if (use_no_calib_int8) { - return int8_teller_set.count(op_type); - } else { - return teller_set.count(op_type); - } - } - - private: - // use this set for no calib int8. - std::unordered_set int8_teller_set{ - "mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "elu", - "selu", - "softsign", - "softplus", - "stanh", - "thresholded_relu", - "exp", - "log", - "sqrt", - "abs", - "sin", - "cos", - "tan", - "sinh", - "cosh", - "asin", - "acos", - "atan", - "asinh", - "atanh", - "ceil", - "floor", - "erf", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_sub", - "elementwise_mul", - "elementwise_div", - "elementwise_pow", - "equal", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "silu", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "top_k", - "top_k_v2", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "yolo_box_head", - "arg_max", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "nearest_interp_v2", - "bilinear_interp_v2", - "pool3d", - "deformable_conv", - "relu6", - "hard_sigmoid", - "clip", - "fused_embedding_eltwise_layernorm", - "multihead_matmul", - "skip_layernorm", - "slice", - "strided_slice", - "fused_preln_embedding_eltwise_layernorm", - "preln_residual_bias", - "c_allreduce_sum", - "c_allreduce_min", - "c_allreduce_max", - "c_allreduce_prod", - "roll", - "cast", - "preln_skip_layernorm", - "transformer_input_convert", - "recover_padding", - "remove_padding", - "fill_constant", - "sum", - "shape", - "squeeze2", - "unsqueeze2", - "layernorm_shift_partition"}; - std::unordered_set teller_set{ - "mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "elu", - "selu", - "softsign", - "softplus", - "stanh", - "thresholded_relu", - "exp", - "log", - "sqrt", - "abs", - "sin", - "cos", - "tan", - "sinh", - "cosh", - "asin", - "acos", - "atan", - "asinh", - "atanh", - "ceil", - "floor", - "erf", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_sub", - 
"elementwise_mul", - "elementwise_div", - "elementwise_pow", - "equal", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "silu", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "top_k", - "top_k_v2", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "yolo_box_head", - "arg_max", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "bilinear_interp_v2", - "nearest_interp_v2", - "pool3d", - "deformable_conv", - "relu6", - "hard_sigmoid", - "clip", - "fused_embedding_eltwise_layernorm", - "multihead_matmul", - "skip_layernorm", - "slice", - "strided_slice", - "fused_preln_embedding_eltwise_layernorm", - "preln_skip_layernorm", - "preln_residual_bias", - "c_allreduce_sum", - "c_allreduce_min", - "c_allreduce_max", - "c_allreduce_prod", - "roll", - "cast", - "transformer_input_convert", - "recover_padding", - "remove_padding", - "fill_constant", - "sum", - "shape", - "squeeze2", - "unsqueeze2", - "fused_token_prune", - "layernorm_shift_partition"}; -}; - -bool OpTeller::Tell(const framework::ir::Node* node, - bool use_no_calib_int8, - bool with_dynamic_shape) { - const std::string op_type = node->Op()->Type(); - const framework::OpDesc desc = *node->Op(); - - for (auto& teller : tellers_) { + bool operator()(const framework::OpDesc& desc, + bool use_no_calib_int8 = false, + bool with_dynamic_shape = false) override { + const std::string op_type = desc.Type(); + // do not support the op which is labeled the `skip_quant` + if ((desc.HasAttr("namescope") && + PADDLE_GET_CONST(std::string, desc.GetAttr("op_namescope")) == + "/skip_quant_2/") || + desc.HasAttr("skip_quant")) + return false; std::unordered_set act_op_list = { "relu", "relu6", "sigmoid", "elu", "selu", "softsign", @@ -2294,13 +2069,329 @@ bool OpTeller::Tell(const framework::ir::Node* node, } } - if ((*teller)(op_type, desc, use_no_calib_int8)) return true; + if (use_no_calib_int8) { + return int8_teller_set.count(op_type); + } else { + return teller_set.count(op_type); + } } + private: + // use this set for no calib int8. 
+ std::unordered_set int8_teller_set{ + "mul", + "matmul", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "elu", + "selu", + "softsign", + "softplus", + "stanh", + "thresholded_relu", + "exp", + "log", + "sqrt", + "abs", + "sin", + "cos", + "tan", + "sinh", + "cosh", + "asin", + "acos", + "atan", + "asinh", + "atanh", + "ceil", + "floor", + "erf", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_sub", + "elementwise_mul", + "elementwise_div", + "elementwise_pow", + "equal", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "silu", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "top_k", + "top_k_v2", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "yolo_box_head", + "arg_max", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "bilinear_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "strided_slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_residual_bias", + "c_allreduce_sum", + "c_allreduce_min", + "c_allreduce_max", + "c_allreduce_prod", + "roll", + "cast", + "preln_skip_layernorm", + "transformer_input_convert", + "recover_padding", + "remove_padding", + "fill_constant", + "sum", + "shape", + "squeeze2", + "unsqueeze2", + "layernorm_shift_partition"}; + std::unordered_set teller_set{ + "mul", + "matmul", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "elu", + "selu", + "softsign", + "softplus", + "stanh", + "thresholded_relu", + "exp", + "log", + "sqrt", + "abs", + "sin", + "cos", + "tan", + "sinh", + "cosh", + "asin", + "acos", + "atan", + "asinh", + "atanh", + "ceil", + "floor", + "erf", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_sub", + "elementwise_mul", + "elementwise_div", + "elementwise_pow", + "equal", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "silu", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "top_k", + "top_k_v2", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "yolo_box_head", + "arg_max", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "bilinear_interp_v2", + "nearest_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "strided_slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_skip_layernorm", + "preln_residual_bias", + "c_allreduce_sum", + "c_allreduce_min", + "c_allreduce_max", + "c_allreduce_prod", + "roll", + "cast", + "transformer_input_convert", + "recover_padding", + "remove_padding", + "fill_constant", + "sum", + "shape", + "squeeze2", + "unsqueeze2", + "fused_token_prune", + "layernorm_shift_partition"}; +}; + +struct GenericPluginTeller : public Teller { + public: + GenericPluginTeller() {} + 
bool operator()(const framework::OpDesc& desc, + bool use_no_calib_int8 = false, + bool with_dynamic_shape = false) override { + const std::string op_type = desc.Type(); + // only consider dynamic_shape mode + if (!with_dynamic_shape) { + return false; + } + + if (use_no_calib_int8) { + return false; + } else { + framework::InitDefaultKernelSignatureMap(); + bool res = phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_type) || + phi::DefaultKernelSignatureMap::Instance().Has(op_type); + if (!res) { + VLOG(3) << op_type << " has no KernelSignature"; + return false; + } + res = phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type); + if (!res) { + VLOG(3) << op_type << " has no CompatiblePhiKernel in phi."; + return false; + } + auto& dynamic_infermeta_factory = + tensorrt::DynamicMetaFnFactory::Instance(); + res = dynamic_infermeta_factory.Contains(op_type); + if (!res) { + VLOG(3) << op_type << " has no DynamicMetaFn."; + return false; + } + return true; + } + } +}; + +struct CustomPluginTeller : public Teller { + public: + CustomPluginTeller() {} + bool operator()(const framework::OpDesc& desc, + bool use_no_calib_int8 = false, + bool with_dynamic_shape = false) override { + const std::string op_type = desc.Type(); + std::string expect_plugin_name; + + if (with_dynamic_shape) { + expect_plugin_name = op_type + "_paddle_trt_dynamic_plugin"; + } else { + expect_plugin_name = op_type + "_paddle_trt_plugin"; + } + + int num = 0; + auto creators = GetPluginRegistry()->getPluginCreatorList(&num); + + for (int i = 0; i < num; i++) { + if (std::string(creators[i]->getPluginName()) == expect_plugin_name) + return true; + } + return false; + } +}; + +bool OpTeller::Tell(const framework::ir::Node* node, + bool use_no_calib_int8, + bool with_dynamic_shape) { + const std::string op_type = node->Op()->Type(); + const framework::OpDesc desc = *node->Op(); + auto& default_teller = GetDefaultTeller(); + if ((*default_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { + SetOpConverterType(op_type, OpConverterType::Default); + return true; + } + auto& generic_plugin_teller = GetGenericPluginTeller(); + if ((*generic_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { + SetOpConverterType(op_type, OpConverterType::GenericPluginCreater); + return true; + } + auto& custom_plugin_teller = GetCustomPluginTeller(); + if ((*custom_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { + SetOpConverterType(op_type, OpConverterType::CustomPluginCreater); + return true; + } return false; } -OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); } +OpTeller::OpTeller() { + tellers_.emplace_back(new tensorrt::SimpleOpTypeSetTeller); + tellers_.emplace_back(new tensorrt::GenericPluginTeller); + tellers_.emplace_back(new tensorrt::CustomPluginTeller); +} } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index 1a6ce092a18b4..2fa3dc361217e 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -38,9 +38,9 @@ namespace tensorrt { * issues such as op_desc. 
*/ struct Teller { - virtual bool operator()(const std::string& op_type, - const framework::OpDesc& desc, - bool use_no_calib_int8) = 0; + virtual bool operator()(const framework::OpDesc& desc, + bool use_no_calib_int8 = false, + bool with_dynamic_shape = false) = 0; virtual ~Teller() = default; }; @@ -55,9 +55,15 @@ struct Teller { *}; */ +enum class OpConverterType { + Default = 0, + GenericPluginCreater, + CustomPluginCreater +}; /* * class OpTeller helps to tell whether a fluid - * operator can be transformed to a TensorRT layer. + * operator can be transformed to a TensorRT layer + * and use which kind of OpConverter */ class OpTeller { public: @@ -70,11 +76,26 @@ class OpTeller { bool use_no_calib_int8 = false, bool with_dynamic_shape = false); + std::unique_ptr& GetDefaultTeller() { return tellers_.at(0); } + + std::unique_ptr& GetGenericPluginTeller() { return tellers_.at(1); } + + std::unique_ptr& GetCustomPluginTeller() { return tellers_.at(2); } + + void SetOpConverterType(std::string name, OpConverterType type) { + op_converter_type_map_[name] = type; + } + + const std::map& GetOpConverterTypeMap() const { + return op_converter_type_map_; + } + private: OpTeller(); private: std::vector> tellers_; + std::map op_converter_type_map_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index f602714f21150..9fe02cd731d82 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -32,7 +32,8 @@ list( c_allreduce_op_plugin.cu preln_residual_bias_plugin.cu fused_token_prune_op_plugin.cu - layernorm_shift_partition_op.cu) + layernorm_shift_partition_op.cu + generic_plugin.cu) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND TRT_FILES spmm_plugin.cu) @@ -41,7 +42,13 @@ endif() nv_library( tensorrt_plugin SRCS ${TRT_FILES} - DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) + DEPS enforce + tensorrt_engine + prelu + tensor + bert_encoder_functor + tensorrt_dynamic_shape_infermeta_factory + tensorrt_plugin_arg_mapping_context) nv_test( test_split_plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu new file mode 100644 index 0000000000000..2fc6e881e8e5b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu @@ -0,0 +1,463 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tensorrt/plugin/generic_plugin.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_context.h" +#include "paddle/phi/core/kernel_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +void BuildPhiKernelContextAttr(const framework::OpDesc& op_desc, + phi::KernelContext* kernel_context, + const phi::KernelSignature& signature, + const phi::Kernel& phi_kernel) { + const phi::KernelArgsDef& args_def = phi_kernel.args_def(); + const auto& attr_names = signature.attr_names; + const auto& attr_defs = args_def.attribute_defs(); + + PADDLE_ENFORCE_EQ( + attr_names.size(), + attr_defs.size(), + platform::errors::InvalidArgument( + "The attr_names.size() should be equal to attr_defs.size().")); + + framework::AttrReader attr_reader(op_desc.GetAttrMap()); + + for (size_t k = 0; k < attr_names.size(); ++k) { + auto attr_name = attr_names[k]; + auto* attr_ptr = attr_reader.GetAttr(attr_name); + if (attr_ptr) { + switch (attr_defs[k].type_index) { + case phi::AttributeType::SCALAR: { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::FLOAT: + return kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(float, attr))); + break; + case framework::proto::AttrType::INT: + return kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(int, attr))); + break; + case framework::proto::AttrType::STRING: + return kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(std::string, attr))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when " + "ProtoAttr2PhiAttr.", + attr_name)); + } + } break; + + case phi::AttributeType::INT_ARRAY: { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: + kernel_context->EmplaceBackAttr(std::move( + phi::IntArray(PADDLE_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::LONGS: + kernel_context->EmplaceBackAttr(std::move( + phi::IntArray(PADDLE_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::INT: + kernel_context->EmplaceBackAttr( + phi::IntArray({PADDLE_GET_CONST(int, attr)})); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to IntArray when " + "ProtoAttr2PhiAttr.", + attr_name)); + } + } break; + + case phi::AttributeType::SCALARS: { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: { + const auto& vec = PADDLE_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::LONGS: { + const auto& vec = PADDLE_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOATS: { + const auto& vec = PADDLE_GET_CONST(std::vector, attr); + std::vector scalar_list; 
scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOAT64S: { + const auto& vec = PADDLE_GET_CONST(std::vector<double>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector<Scalar> when " + "ProtoAttr2PhiAttr.", + attr_name)); + } + } break; + + default: { + auto& attr = *attr_ptr; + switch (attr_defs[k].type_index) { + case phi::AttributeType::FLOAT32: + kernel_context->EmplaceBackAttr(PADDLE_GET_CONST(float, attr)); + break; + case phi::AttributeType::INT32: + kernel_context->EmplaceBackAttr(PADDLE_GET_CONST(int, attr)); + break; + case phi::AttributeType::BOOL: + kernel_context->EmplaceBackAttr(PADDLE_GET_CONST(bool, attr)); + break; + case phi::AttributeType::INT64: + kernel_context->EmplaceBackAttr(PADDLE_GET_CONST(int64_t, attr)); + break; + case phi::AttributeType::INT32S: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector<int>, attr)); + break; + case phi::AttributeType::DATA_TYPE: { + auto data_type = paddle::framework::TransToPhiDataType( + static_cast<framework::proto::VarType::Type>( + PADDLE_GET_CONST(int, attr))); + kernel_context->EmplaceBackAttr(data_type); + } break; + case phi::AttributeType::STRING: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::string, attr)); + break; + case phi::AttributeType::INT64S: + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::LONGS: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector<int64_t>, attr)); + break; + case framework::proto::AttrType::INTS: { + const auto& vector_int_attr = + PADDLE_GET_CONST(std::vector<int>, attr); + const std::vector<int64_t> vector_int64_attr( + vector_int_attr.begin(), vector_int_attr.end()); + kernel_context->EmplaceBackAttr(vector_int64_attr); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector<int64_t> " + "when ProtoAttr2PhiAttr.", + attr_name)); + } + break; + case phi::AttributeType::FLOAT32S: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector<float>, attr)); + break; + case phi::AttributeType::STRINGS: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector<std::string>, attr)); + break; + case phi::AttributeType::BOOLS: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector<bool>, attr)); + break; + case phi::AttributeType::FLOAT64S: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector<double>, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` when constructing " + "ProtoAttr2PhiAttr.", + attr_name)); + } + } + } + } + } +} + +GenericPlugin::GenericPlugin( + const paddle::framework::proto::OpDesc& proto_op_desc, + const InputOutPutVarInfo& in_out_info) { + proto_op_desc_ = proto_op_desc; + op_desc_ = std::move(framework::OpDesc(proto_op_desc_, nullptr)); + proto_op_desc_.SerializeToString(&op_meta_data_); + inputs_data_type_ = in_out_info.inputs_data_type; + outputs_data_type_ = in_out_info.outputs_data_type; +} + +GenericPlugin::GenericPlugin( + const paddle::framework::proto::OpDesc& proto_op_desc, + const std::vector<int>& inputs_data_type, + const std::vector<int>& outputs_data_type) { + proto_op_desc_ = proto_op_desc; + op_desc_ = std::move(framework::OpDesc(proto_op_desc_, nullptr)); +
proto_op_desc_.SerializeToString(&op_meta_data_); + inputs_data_type_ = inputs_data_type; + outputs_data_type_ = outputs_data_type; +} + +GenericPlugin::GenericPlugin(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &inputs_data_type_); + DeserializeValue(&serial_data, &serial_length, &outputs_data_type_); + std::string op_meta_data((char*)(serial_data), serial_length); // NOLINT + op_meta_data_ = std::move(op_meta_data); + proto_op_desc_.ParseFromString(op_meta_data_); + op_desc_ = std::move(framework::OpDesc(proto_op_desc_, nullptr)); +} + +int GenericPlugin::getNbOutputs() const TRT_NOEXCEPT { + int res = 0; + for (auto& i : op_desc_.Outputs()) { + if (!i.second.empty()) res += i.second.size(); + } + return res; +} + +int GenericPlugin::getNbInputs() const TRT_NOEXCEPT { + int res = 0; + for (auto& i : op_desc_.Inputs()) { + if (!i.second.empty()) res += i.second.size(); + } + return res; +} + +nvinfer1::IPluginV2DynamicExt* GenericPlugin::clone() const TRT_NOEXCEPT { + nvinfer1::IPluginV2DynamicExt* plugin = + new GenericPlugin(proto_op_desc_, inputs_data_type_, outputs_data_type_); + plugin->initialize(); + return plugin; +} + +void GenericPlugin::serialize(void* buffer) const TRT_NOEXCEPT { + // inputs_data_type_ + SerializeValue(&buffer, inputs_data_type_); + // outputs_data_type_ + SerializeValue(&buffer, outputs_data_type_); + // serialize op_meta_data_ + std::memcpy(buffer, op_meta_data_.c_str(), op_meta_data_.size()); + reinterpret_cast<char*&>(buffer) += op_meta_data_.size(); +} + +bool GenericPlugin::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + return true; +} + +nvinfer1::DataType GenericPlugin::getOutputDataType( + int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + return input_types[0]; +} + +int GenericPlugin::initialize() TRT_NOEXCEPT { + std::string op_type = op_desc_.Type(); + + phi::KernelSignature phi_kernel_signature; + if (phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_type)) { + const phi::ArgumentMappingFn* argument_mapping_func = + phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_type); + PluginArgumentMappingContext argument_mapping_context(&op_desc_); + phi_kernel_signature = (*argument_mapping_func)(argument_mapping_context); + } else { + phi_kernel_signature = + phi::DefaultKernelSignatureMap::Instance().Get(op_type); + } + + phi::KernelKey phi_kernel_key( + phi::Backend::GPU, phi::DataLayout::ANY, phi::DataType::FLOAT32); + + PADDLE_ENFORCE_EQ( + phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type), + true, + platform::errors::Fatal("%s has no compatible phi kernel!", + op_type.c_str())); + + const phi::Kernel& phi_kernel = phi::KernelFactory::Instance().SelectKernel( + phi_kernel_signature.name, phi_kernel_key); + phi_kernel_ = &phi_kernel; + + PADDLE_ENFORCE_EQ(phi_kernel_->IsValid(), + true, + platform::errors::Fatal("%s phi kernel is invalid!", + phi_kernel_signature.name)); + + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + platform::CUDAPlace place(platform::GetCurrentDeviceId()); + auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(place)); + + phi_kernel_context_ = new phi::KernelContext(dev_ctx); + dense_tensor_inputs_ = new std::vector<phi::DenseTensor>(getNbInputs()); + dense_tensor_outputs_ = new std::vector<phi::DenseTensor>(getNbOutputs()); + + BuildPhiKernelContextAttr( + op_desc_, phi_kernel_context_, phi_kernel_signature, phi_kernel); + return 0; +} +
+nvinfer1::DimsExprs GenericPlugin::getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { + CHECK(output_index < getNbOutputs()); + auto& dynamic_infermeta_factory = tensorrt::DynamicMetaFnFactory::Instance(); + PADDLE_ENFORCE_EQ(dynamic_infermeta_factory.Contains(op_desc_.Type()), + true, + platform::errors::InvalidArgument( + "The %s op has no dynamic plugin infershape function!", + op_desc_.Type().c_str())); + + auto* infershape_func = dynamic_infermeta_factory.Get(op_desc_.Type()); + return infershape_func( + output_index, inputs, nb_inputs, expr_builder, op_desc_); +} + +void GenericPlugin::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, + int nb_inputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nb_outputs) TRT_NOEXCEPT { + CHECK(phi_kernel_context_); + CHECK(phi_kernel_); + CHECK(nb_inputs == getNbInputs()); + CHECK(nb_outputs == getNbOutputs()); +} + +// Shutdown the layer. This is called when the engine is destroyed +void GenericPlugin::terminate() TRT_NOEXCEPT { + delete phi_kernel_context_; + delete dense_tensor_inputs_; + delete dense_tensor_outputs_; +} + +int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + platform::CUDAPlace place(platform::GetCurrentDeviceId()); + + // TODO: the generic plugin does not support FP16 and INT8 precision yet + auto protoType2PhiType = [](int proto_type) -> phi::DataType { + if (proto_type == + static_cast<int>(framework::proto::VarType_Type::VarType_Type_FP32)) + return phi::DataType::FLOAT32; + else if (proto_type == + static_cast<int>( + framework::proto::VarType_Type::VarType_Type_INT64) || + proto_type == + static_cast<int>( + framework::proto::VarType_Type::VarType_Type_INT32)) + return phi::DataType::INT32; + else if (proto_type == + static_cast<int>( + framework::proto::VarType_Type::VarType_Type_BOOL)) + return phi::DataType::BOOL; + else + CHECK(false) << "precision is not supported"; + }; + + // input + for (int i = 0; i < getNbInputs(); i++) { + auto const& input_dims = input_desc[i].dims; + + std::vector<int> input_shape; + for (int j = 0; j < input_dims.nbDims; j++) + input_shape.push_back(input_dims.d[j]); + + int input_numel = 1; + for (size_t k = 0; k < input_shape.size(); k++) input_numel *= input_shape[k]; + + phi::DenseTensorMeta input_meta(protoType2PhiType(inputs_data_type_[i]), + phi::make_ddim(input_shape)); + std::shared_ptr<phi::Allocation> input_alloc( + new phi::Allocation((void*)(inputs[i]), // NOLINT + input_numel * sizeof(int32_t), + place)); + (*dense_tensor_inputs_)[i] = + std::move(phi::DenseTensor(input_alloc, input_meta)); + phi_kernel_context_->EmplaceBackInput(&((*dense_tensor_inputs_)[i])); + } + + // output + for (int i = 0; i < getNbOutputs(); i++) { + auto const& output_dims = output_desc[i].dims; + + std::vector<int> output_shape; + for (int j = 0; j < output_dims.nbDims; j++) + output_shape.push_back(output_dims.d[j]); + + int output_numel = 1; + for (size_t k = 0; k < output_shape.size(); k++) + output_numel *= output_shape[k]; + + phi::DenseTensorMeta output_meta(protoType2PhiType(outputs_data_type_[i]), + phi::make_ddim(output_shape)); + std::shared_ptr<phi::Allocation> output_alloc( + new phi::Allocation(reinterpret_cast<void*>(outputs[i]), + output_numel * sizeof(float), + place)); + (*dense_tensor_outputs_)[i] = +
std::move(phi::DenseTensor(output_alloc, output_meta)); + phi_kernel_context_->EmplaceBackOutput(&((*dense_tensor_outputs_)[i])); + } + + (*phi_kernel_)(phi_kernel_context_); + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h new file mode 100644 index 0000000000000..39730937af2f5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h @@ -0,0 +1,162 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <string> +#include <vector> + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" +#include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" +#include "paddle/fluid/memory/allocation/cuda_allocator.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +void BuildPhiKernelContextAttr(const framework::OpDesc& op_desc, + phi::KernelContext* kernel_context, + const phi::KernelSignature& signature, + const phi::Kernel& phi_kernel); + +class GenericPlugin : public DynamicPluginTensorRT { + public: + struct InputOutPutVarInfo { + std::vector<int> inputs_data_type; + std::vector<int> outputs_data_type; + }; + + public: + GenericPlugin() {} + + GenericPlugin(const paddle::framework::proto::OpDesc& proto_op_desc, + const InputOutPutVarInfo& in_out_info); + + GenericPlugin(const paddle::framework::proto::OpDesc& proto_op_desc, + const std::vector<int>& inputs_data_type, + const std::vector<int>& outputs_data_type); + + // It is used for TensorRT deserialization. + // It should not be called by users. + GenericPlugin(void const* serialData, size_t serialLength); + + // IPluginV2 method + const char* getPluginType() const TRT_NOEXCEPT override { + return "generic_plugin"; + } + + int getNbOutputs() const TRT_NOEXCEPT override; + + int getNbInputs() const TRT_NOEXCEPT; + + // Initialize the layer for execution. + int initialize() TRT_NOEXCEPT override; + + // Shutdown the layer.
This is called when the engine is destroyed + void terminate() TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT {} + + size_t getSerializationSize() const TRT_NOEXCEPT { + return op_meta_data_.size() + SerializedSize(inputs_data_type_) + + SerializedSize(outputs_data_type_); + } + + void serialize(void* buffer) const TRT_NOEXCEPT; + + // The Func in IPluginV2 + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT + TRT_NOEXCEPT; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nb_inputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nb_outputs) TRT_NOEXCEPT; + + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT; + + private: + std::string op_meta_data_; + framework::proto::OpDesc proto_op_desc_; + framework::OpDesc op_desc_; + + private: + phi::KernelContext* phi_kernel_context_; + const phi::Kernel* phi_kernel_; + std::vector<phi::DenseTensor>* dense_tensor_inputs_; + std::vector<phi::DenseTensor>* dense_tensor_outputs_; + + private: + InputOutPutVarInfo in_out_info_; + std::vector<int> inputs_data_type_; + std::vector<int> outputs_data_type_; +}; + +class GenericPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "generic_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) + TRT_NOEXCEPT override { + return new GenericPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(GenericPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h index df404ae3e10e2..433ff37aac7bb 100644 --- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h @@ -125,10 +125,11 @@ class MishPluginDynamic : public DynamicPluginTensorRT { size_t getSerializationSize() const TRT_NOEXCEPT override; void serialize(void* buffer) const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int output_index, - const nvinfer1::DimsExprs* inputs, - int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT + TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu index 3963b48a26c6c..c6be871709452 100644 --- a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu @@ -118,7 +118,28 @@ int RecoverPaddingPlugin::enqueue(const
nvinfer1::PluginTensorDesc* inputDesc, const int32_t* input1 = static_cast(inputs[1]); // pos_id_tensor float* output = static_cast(outputs[0]); - const int32_t num_threads = 256; + int32_t num_threads; + if (input0_desc.dims.d[1] % 512 == 0) { + num_threads = 512; + } else if (input0_desc.dims.d[1] % 256 == 0) { + num_threads = 256; + } else if (input0_desc.dims.d[1] % 128 == 0) { + num_threads = 128; + } else if (input0_desc.dims.d[1] % 64 == 0) { + num_threads = 64; + } else if (input0_desc.dims.d[1] % 32 == 0) { + num_threads = 32; + } else if (input0_desc.dims.d[1] % 16 == 0) { + num_threads = 16; + } else if (input0_desc.dims.d[1] % 8 == 0) { + num_threads = 8; + } else if (input0_desc.dims.d[1] % 4 == 0) { + num_threads = 4; + } else if (input0_desc.dims.d[1] % 2 == 0) { + num_threads = 2; + } else { + num_threads = 1; + } const dim3 num_blocks( input1_desc.dims.d[0] - 1, input2_desc.dims.d[1], diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu index 418ecb015784f..9f1a1d6d2c109 100644 --- a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu @@ -110,10 +110,29 @@ int RemovePaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const int32_t* input1 = static_cast(inputs[1]); // pos_id_tensor float* output = static_cast(outputs[0]); - const auto input0_desc = inputDesc[0]; - - const int32_t num_threads = 256; + int32_t num_threads; + if (input0_desc.dims.d[2] % 512 == 0) { + num_threads = 512; + } else if (input0_desc.dims.d[2] % 256 == 0) { + num_threads = 256; + } else if (input0_desc.dims.d[2] % 128 == 0) { + num_threads = 128; + } else if (input0_desc.dims.d[2] % 64 == 0) { + num_threads = 64; + } else if (input0_desc.dims.d[2] % 32 == 0) { + num_threads = 32; + } else if (input0_desc.dims.d[2] % 16 == 0) { + num_threads = 16; + } else if (input0_desc.dims.d[2] % 8 == 0) { + num_threads = 8; + } else if (input0_desc.dims.d[2] % 4 == 0) { + num_threads = 4; + } else if (input0_desc.dims.d[2] % 2 == 0) { + num_threads = 2; + } else { + num_threads = 1; + } const dim3 num_blocks( input0_desc.dims.d[0], input0_desc.dims.d[1], diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc new file mode 100644 index 0000000000000..5d9998d255624 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +bool PluginArgumentMappingContext::HasInput(const std::string& name) const { + auto inputs = op_desc_ptr_->Inputs(); + for (auto& i : inputs) { + if (i.first == name && !i.second.empty()) return true; + } + return false; +} + +bool PluginArgumentMappingContext::HasOutput(const std::string& name) const { + auto outputs = op_desc_ptr_->Outputs(); + for (auto& i : outputs) { + if (i.first == name && !i.second.empty()) return true; + } + return false; +} + +bool PluginArgumentMappingContext::HasAttr(const std::string& name) const { + return op_desc_ptr_->HasAttr(name); +} + +paddle::any PluginArgumentMappingContext::Attr( + const std::string& attr_name) const { + auto attr_type = op_desc_ptr_->GetAttrType(attr_name); + switch (attr_type) { + case framework::proto::AttrType::INT: { + return PADDLE_GET_CONST(int, op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::FLOAT: { + return PADDLE_GET_CONST(float, op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::STRING: { + return PADDLE_GET_CONST(std::string, op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::INTS: { + return PADDLE_GET_CONST(std::vector, + op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::FLOATS: { + return PADDLE_GET_CONST(std::vector, + op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::STRINGS: { + return PADDLE_GET_CONST(std::vector, + op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::BOOLEAN: { + return PADDLE_GET_CONST(bool, op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::BOOLEANS: { + return PADDLE_GET_CONST(std::vector, + op_desc_ptr_->GetAttr(attr_name)); + break; + }; + default: { + LOG(ERROR) << "Can't conver op's attribute [" << attr_name + << "] to paddle any."; + } + } + return paddle::any(); +} + +size_t PluginArgumentMappingContext::InputSize(const std::string& name) const { + return op_desc_ptr_->Inputs().at(name).size(); +} +size_t PluginArgumentMappingContext::OutputSize(const std::string& name) const { + return op_desc_ptr_->Outputs().at(name).size(); +} +bool PluginArgumentMappingContext::IsDenseTensorInput( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsDenseTensorInputs( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsSelectedRowsInput( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsDenseTensorVectorInput( + const std::string& name) const { + return false; +} + +bool PluginArgumentMappingContext::IsDenseTensorOutput( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsSelectedRowsOutput( + const std::string& name) const { + return false; +} +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h new file mode 100644 index 0000000000000..35229a5ab794e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/phi/core/compat/arg_map_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext { + public: + explicit PluginArgumentMappingContext(framework::OpDesc* op_desc_ptr) + : op_desc_ptr_(op_desc_ptr) {} + + bool HasInput(const std::string& name) const override; + + bool HasOutput(const std::string& name) const override; + + bool HasAttr(const std::string& name) const override; + + paddle::any Attr(const std::string& attr_name) const override; + + size_t InputSize(const std::string& name) const override; + + size_t OutputSize(const std::string& name) const override; + + bool IsDenseTensorInput(const std::string& name) const override; + + bool IsDenseTensorInputs(const std::string& name) const override; + + bool IsSelectedRowsInput(const std::string& name) const override; + + bool IsDenseTensorVectorInput(const std::string& name) const override; + + bool IsDenseTensorOutput(const std::string& name) const override; + + bool IsSelectedRowsOutput(const std::string& name) const override; + + bool IsForInferShape() const override { return false; } + + private: + framework::OpDesc* op_desc_ptr_; +}; +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc new file mode 100644 index 0000000000000..75716a91f574f --- /dev/null +++ b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include <gtest/gtest.h> + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(ArgMappingContextTest, BasicFunction) { + paddle::framework::proto::OpDesc op; + op.set_type("imaged_op"); + auto *input_var = op.add_inputs(); + input_var->set_parameter("X"); + *input_var->add_arguments() = "input"; + + auto *output_var = op.add_outputs(); + output_var->set_parameter("Out"); + *output_var->add_arguments() = "output"; + + auto *attr = op.add_attrs(); + attr->set_name("int_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(1); + + attr = op.add_attrs(); + attr->set_name("float_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(1.0); + + attr = op.add_attrs(); + attr->set_name("string_attr"); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s("1"); + + attr = op.add_attrs(); + attr->set_name("bool_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + attr = op.add_attrs(); + attr->set_name("ints_attr"); + attr->set_type(paddle::framework::proto::AttrType::INTS); + attr->add_ints(1); + attr->add_ints(2); + + attr = op.add_attrs(); + attr->set_name("floats_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOATS); + attr->add_floats(1.0); + attr->add_floats(2.0); + + attr = op.add_attrs(); + attr->set_name("strings_attr"); + attr->set_type(paddle::framework::proto::AttrType::STRINGS); + attr->add_strings("1"); + attr->add_strings("2"); + + attr = op.add_attrs(); + attr->set_name("bools_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEANS); + attr->add_bools(true); + attr->add_bools(true); + + framework::OpDesc op_desc(op, nullptr); + PluginArgumentMappingContext context(&op_desc); + + EXPECT_EQ(context.HasInput("X"), true); + EXPECT_EQ(context.HasOutput("Out"), true); + EXPECT_EQ(context.HasAttr("int_attr"), true); + + int int_attr = any_cast<int>(context.Attr("int_attr")); + EXPECT_EQ(int_attr, 1); + + float float_attr = any_cast<float>(context.Attr("float_attr")); + EXPECT_EQ(float_attr, 1); + + std::string string_attr = any_cast<std::string>(context.Attr("string_attr")); + EXPECT_EQ(string_attr, "1"); + + bool bool_attr = any_cast<bool>(context.Attr("bool_attr")); + EXPECT_EQ(bool_attr, true); + + std::vector<int> ints_attr = + any_cast<std::vector<int>>(context.Attr("ints_attr")); + EXPECT_EQ(ints_attr[0], 1); + EXPECT_EQ(ints_attr[1], 2); + + std::vector<float> floats_attr = + any_cast<std::vector<float>>(context.Attr("floats_attr")); + EXPECT_EQ(floats_attr[0], 1.0); + EXPECT_EQ(floats_attr[1], 2.0); + + std::vector<std::string> strings_attr = + any_cast<std::vector<std::string>>(context.Attr("strings_attr")); + EXPECT_EQ(strings_attr[0], "1"); + EXPECT_EQ(strings_attr[1], "2"); + + std::vector<bool> bools_attr = + any_cast<std::vector<bool>>(context.Attr("bools_attr")); + EXPECT_EQ(bools_attr[0], true); + EXPECT_EQ(bools_attr[1], true); + + EXPECT_EQ(context.InputSize("X"), 1UL); + EXPECT_EQ(context.OutputSize("Out"), 1UL); + EXPECT_EQ(context.IsDenseTensorInput("X"), false); + EXPECT_EQ(context.IsDenseTensorInputs("X"), false); + EXPECT_EQ(context.IsSelectedRowsInput("X"), false); + EXPECT_EQ(context.IsDenseTensorVectorInput("X"), false); + + EXPECT_EQ(context.IsDenseTensorOutput("Out"), false); + EXPECT_EQ(context.IsSelectedRowsOutput("Out"), false); + EXPECT_EQ(context.IsForInferShape(), false); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/jit/CMakeLists.txt
b/paddle/fluid/jit/CMakeLists.txt index 565bd670b98bf..872845d07999c 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -34,7 +34,8 @@ cc_library( cc_library( jit_function SRCS function.cc - DEPS jit_function_utils jit_executor_engine jit_pe_engine) + DEPS jit_function_utils jit_executor_engine jit_pe_engine + jit_interpreter_engine) cc_library( jit_layer @@ -46,6 +47,7 @@ cc_library( jit_function_schema jit_executor_engine jit_pe_engine + jit_interpreter_engine jit_function) if(WITH_TESTING AND NOT WIN32) @@ -65,10 +67,19 @@ if(WITH_TESTING AND NOT WIN32) feed_op fetch_op scale_op + transfer_layout_op jit_layer) cc_test( layer_test SRCS layer_test.cc DEPS ${JIT_DEPS}) add_dependencies(layer_test jit_download_program) + + cc_test( + layer_test_new + SRCS layer_test.cc + DEPS ${JIT_DEPS}) + add_dependencies(layer_test_new jit_download_program) + set_tests_properties(layer_test_new PROPERTIES ENVIRONMENT + "FLAGS_jit_engine_type=New") endif() diff --git a/paddle/fluid/jit/engine/CMakeLists.txt b/paddle/fluid/jit/engine/CMakeLists.txt index 92a1f9582c931..5626e9eb1fc67 100644 --- a/paddle/fluid/jit/engine/CMakeLists.txt +++ b/paddle/fluid/jit/engine/CMakeLists.txt @@ -7,3 +7,8 @@ cc_library( jit_pe_engine SRCS pe_engine.cc DEPS parallel_executor) + +cc_library( + jit_interpreter_engine + SRCS interpreter_engine.cc + DEPS standalone_executor) diff --git a/paddle/fluid/jit/engine/interpreter_engine.cc b/paddle/fluid/jit/engine/interpreter_engine.cc new file mode 100644 index 0000000000000..410fd4dc01bed --- /dev/null +++ b/paddle/fluid/jit/engine/interpreter_engine.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "paddle/fluid/jit/engine/interpreter_engine.h"
+
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/new_executor/interpretercore.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/phi/core/enforce.h"
+
+namespace paddle {
+namespace jit {
+
+InterpreterEngine::InterpreterEngine(const std::shared_ptr<FunctionInfo> &info,
+                                     const VariableMap &params_dict,
+                                     const phi::Place &place)
+    : info_(info), place_(place) {
+  info_->RemoveDescFeedFetch();
+  PADDLE_ENFORCE_GT(
+      static_cast<int64_t>(info_->ProgramDesc().Block(0).OpSize()),
+      0,
+      platform::errors::PreconditionNotMet(
+          "There is no operator in ProgramDesc."));
+  utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, &scope_);
+  VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_);
+  CreateInterpreterCore();
+}
+
+void InterpreterEngine::CreateInterpreterCore() {
+  auto &program_desc = info_->ProgramDesc();
+
+  // apply inference pass
+  framework::ir::Graph graph{program_desc};
+  auto pass =
+      framework::ir::PassRegistry::Instance().Get("delete_dropout_op_x_pass");
+  pass->Apply(&graph);
+#ifdef PADDLE_WITH_MKLDNN
+  auto mkldnn_pass =
+      framework::ir::PassRegistry::Instance().Get("mkldnn_placement_pass");
+  mkldnn_pass->Set("mkldnn_enabled_op_types",
+                   new std::unordered_set<std::string>({}));
+  mkldnn_pass->Apply(&graph);
+#endif
+
+  GraphToProgram(graph, &converted_prog_, nullptr);
+
+  auto in_names = info_->InputArgNames();
+  auto out_names = info_->OutputArgNames();
+  std::set<std::string> skip_gc_vars;
+  skip_gc_vars.insert(in_names.begin(), in_names.end());
+  skip_gc_vars.insert(out_names.begin(), out_names.end());
+
+  inner_interpreter_ =
+      std::make_shared<InterpreterCore>(place_,
+                                        converted_prog_.Block(0),
+                                        /*skip_gc_vars=*/skip_gc_vars,
+                                        &scope_,
+                                        /*used_for_jit=*/true);
+}
+
+std::vector<Tensor> InterpreterEngine::operator()(
+    const std::vector<Tensor> &inputs) {
+  auto dense_tensors = utils::ToDenseTensors(inputs);
+  return utils::ToTensors(this->operator()(dense_tensors));
+}
+
+std::vector<DenseTensor> InterpreterEngine::operator()(
+    const std::vector<DenseTensor> &inputs) {
+  utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_);
+
+  // The feed/fetch name handling below can later be moved to the Python side.
+  auto &feed_names = info_->InputArgNames();
+  auto &fetch_names = info_->OutputArgNames();
+  paddle::framework::FetchList outs = inner_interpreter_->Run(feed_names);
+
+  std::vector<DenseTensor> outputs;
+  utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs);
+  scope_.DropKids();
+
+  return outputs;
+}
+
+const std::shared_ptr<FunctionInfo> &InterpreterEngine::Info() const {
+  return info_;
+}
+
+}  // namespace jit
+}  // namespace paddle
diff --git a/paddle/fluid/jit/engine/interpreter_engine.h b/paddle/fluid/jit/engine/interpreter_engine.h
new file mode 100644
index 0000000000000..8c7f43f297d22
--- /dev/null
+++ b/paddle/fluid/jit/engine/interpreter_engine.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+
+#include "paddle/fluid/jit/engine/base_engine.h"
+#include "paddle/fluid/jit/function_schema.h"
+#include "paddle/fluid/jit/function_utils.h"
+
+namespace paddle {
+
+namespace framework {
+class InterpreterCore;
+}  // namespace framework
+
+namespace jit {
+using InterpreterCore = framework::InterpreterCore;
+// using Graph = framework::ir::Graph;
+
+class InterpreterEngine : public BaseEngine {
+ public:
+  InterpreterEngine(const std::shared_ptr<FunctionInfo> &info,
+                    const VariableMap &params_dict,
+                    const phi::Place &place);
+
+  ~InterpreterEngine() noexcept {}
+
+  void CreateInterpreterCore();
+
+  std::vector<Tensor> operator()(const std::vector<Tensor> &inputs);
+
+  std::vector<DenseTensor> operator()(const std::vector<DenseTensor> &inputs);
+
+  const std::shared_ptr<FunctionInfo> &Info() const;
+
+ private:
+  std::shared_ptr<FunctionInfo> info_;
+  framework::Scope scope_;
+  phi::Place place_;
+  std::shared_ptr<InterpreterCore> inner_interpreter_;
+  framework::ProgramDesc converted_prog_;
+};
+
+}  // namespace jit
+}  // namespace paddle
diff --git a/paddle/fluid/jit/layer_test.cc b/paddle/fluid/jit/layer_test.cc
index 360eebe8b4f9a..7007693aa83e0 100644
--- a/paddle/fluid/jit/layer_test.cc
+++ b/paddle/fluid/jit/layer_test.cc
@@ -38,6 +38,7 @@ USE_OP_ITSELF(reduce_mean);
 USE_OP_ITSELF(feed);
 USE_OP_ITSELF(fetch);
 USE_OP_ITSELF(scale);
+USE_OP_ITSELF(transfer_layout);
 
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc
index 65a39bc7f9a56..8e8bb370e81c4 100644
--- a/paddle/fluid/jit/serializer.cc
+++ b/paddle/fluid/jit/serializer.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/platform/device_context.h"
 
 #include "paddle/fluid/jit/engine/executor_engine.h"
+#include "paddle/fluid/jit/engine/interpreter_engine.h"
 #include "paddle/fluid/jit/engine/pe_engine.h"
 #include "paddle/fluid/jit/layer.h"
 #include "paddle/fluid/jit/property.h"
@@ -79,6 +80,12 @@ Layer Deserializer::operator()(const std::string& path,
       VLOG(3) << "Add function type: PEEngine. Function name: " << func_name;
       layer.SetEngine(func_name,
                       utils::MakeEngine<PEEngine>(info, params_dict, place));
+    } else if (FLAGS_jit_engine_type == "New") {
+      VLOG(3) << "Add function type: InterpreterEngine. Function name: "
+              << func_name;
+      layer.SetEngine(
+          func_name,
+          utils::MakeEngine<InterpreterEngine>(info, params_dict, place));
     } else {
       PD_THROW("Invalid JitLayer engine type.");
     }
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 1337533f3bbfe..41160bf46cb4f 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -607,7 +607,7 @@ class LogitOpMaker : public framework::OpProtoAndCheckerMaker {
               "(float, default 1e-6f) the epsilon for input clamp bound")
         .SetDefault(1e-6f);
     AddComment(R"DOC(
-Logit Operator. 
+Logit Operator.
 
 This function is defined as follows:
 
 $ logit=ln\left ( {\frac {x} {1-x}} \right ) $
diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc
index f4e7481bdd456..cd4a9fbdb332c 100644
--- a/paddle/fluid/operators/add_position_encoding_op.cc
+++ b/paddle/fluid/operators/add_position_encoding_op.cc
@@ -87,7 +87,7 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
     });
     AddComment(R"DOC(
     Add Position Encoding Operator.
-    
+
     The add position encoding calculates the output based on the input, alpha, beta.
     The size of each dimension of the parameters is checked in the infer-shape.
   )DOC");
diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc
index 8642d572e2d27..833285615f169 100644
--- a/paddle/fluid/operators/addmm_op.cc
+++ b/paddle/fluid/operators/addmm_op.cc
@@ -39,29 +39,23 @@ class AddMMOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const {
-    framework::LibraryType library = framework::LibraryType::kPlain;
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
-    int customized_type_value =
-        framework::OpKernelType::kDefaultCustomizedTypeValue;
     auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
 #ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
-
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      int customized_type_value =
+          framework::OpKernelType::kDefaultCustomizedTypeValue;
       if (input_data_type == framework::DataTypeTrait<int8_t>::DataType() ||
          input_data_type == framework::DataTypeTrait<uint8_t>::DataType()) {
        customized_type_value = kMULMKLDNNINT8;
      }
+      return framework::OpKernelType(input_data_type,
+                                     ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN,
+                                     customized_type_value);
     }
 #endif
-
-    return framework::OpKernelType(input_data_type,
-                                   ctx.GetPlace(),
-                                   layout,
-                                   library,
-                                   customized_type_value);
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
 
@@ -77,7 +71,7 @@ class AddMMOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 AddMM Operator.
 This operator is used to perform matrix multiplication for input $x$ and $y$ with coefficient $alpha$.
-$input$ with coefficient $beta$ is added to the final result. 
+$input$ with coefficient $beta$ is added to the final result.
 The equation is:
 
 $$Out = alpha * x * y + beta * input$$
diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc
index a459196b7611d..1c0b8800f7bf5 100644
--- a/paddle/fluid/operators/affine_grid_op.cc
+++ b/paddle/fluid/operators/affine_grid_op.cc
@@ -177,7 +177,7 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
                   [x_14, x_15, x_16]]
                  [[x_21, x_22, x_23]
                   [x_24, x_25, x_26]]]
-
+
         OutputShape = [2, 3, 5, 5]
 
     Step 1:
@@ -185,12 +185,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
     Generate relative coordinates according to OutputShape.
     The values of relative coordinates are in the interval between -1 and 1.
     The shape of the relative coordinates is [2, H, W] as below:
-
+
     C = [[[-1. -1. -1. -1. -1. ]
           [-0.5 -0.5 -0.5 -0.5 -0.5]
           [ 0.  0.  0.  0.  0. ]
           [ 0.5 0.5 0.5 0.5 0.5]
-          [ 1.  1.  1.  1.  1. ]] 
+          [ 1.  1.  1.  1.  1. ]]
          [[-1. -0.5 0.  0.5 1. ]
           [-1. -0.5 0.  0.5 1. ]
           [-1. -0.5 0.  0.5 1. ]
@@ -198,7 +198,7 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
           [-1. -0.5 0.  0.5 1. ]]]
     C[0] is the coordinates in height axis and C[1] is the coordinates in width axis.
-
+
     Step 2:
         Transpose and reshape C to shape [H * W, 2] and append ones to last dimension.
        Then we get:
diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc
index aa3cd5d4149c4..fa6bc1d6f7757 100644
--- a/paddle/fluid/operators/allclose_op.cc
+++ b/paddle/fluid/operators/allclose_op.cc
@@ -47,7 +47,7 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
              "compared as equal. Default: :math:`False` .")
         .SetDefault(false);
 
-    AddComment(R"DOC( 
+    AddComment(R"DOC(
 This operator checks if all :math:`x` and :math:`y` satisfy the condition:
 
 .. math::
@@ -110,7 +110,7 @@ REGISTER_OP_VERSION(allclose)
                    "The added input 'Atol' is not"
                    "dispensable."))
     .AddCheckpoint(
-        R"ROC(Delete two float attributes [rtol] and [atol], 
+        R"ROC(Delete two float attributes [rtol] and [atol],
         then add 2 string attributes [atol, rtol]. Don't be surprised.
         This is because float cannot represent high-precision
         floating-point values, and our framework doesn't support
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc
index 3404209063ec8..a8d1f36f1159d 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc
@@ -69,8 +69,8 @@
 Check if input X contains all finite data, if yes, scale it by input Scale.
 
 $$Out = X / scale$$
 
 If any tensor in X contains Inf or Nan, the Out will generate an indicator.
-FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of 
-Out should not be used, and its data may not be deterministic. 
+FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
+Out should not be used, and its data may not be deterministic.
 Otherwise, FoundInfinite will be 0 (False).
 
 )DOC");
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
index c8dc8217ef210..03a5f734c2dc8 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
@@ -111,8 +111,8 @@ class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
                   "Stop updating loss scaling, and just zero inputs.")
         .SetDefault(false);
     AddComment(R"DOC(
-Update loss scaling according to overall gradients. If all gradients is 
-finite after incr_every_n_steps, loss scaling will increase by incr_ratio. 
+Update loss scaling according to overall gradients. If all gradients are
+finite after incr_every_n_steps, loss scaling will increase by incr_ratio.
 Otherwise, loss scaling will decrease by decr_ratio after
 decr_every_n_nan_or_inf steps at which some gradients are infinite.
diff --git a/paddle/fluid/operators/angle_op.cc b/paddle/fluid/operators/angle_op.cc
index 5c18f4c6fcc7d..ccd5584e8dedf 100644
--- a/paddle/fluid/operators/angle_op.cc
+++ b/paddle/fluid/operators/angle_op.cc
@@ -16,9 +16,6 @@
 #include
 #include
 #include
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#endif
 
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc
index 7938c1182d292..f17723bf83f65 100644
--- a/paddle/fluid/operators/argsort_op.cc
+++ b/paddle/fluid/operators/argsort_op.cc
@@ -58,9 +58,9 @@ class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Argsort operator
 
-Performs sorting on the input tensor along the given axis and outputs two 
-tensors, Output(Out) and Output(Indices).
They reserve the same shape -with Input(X), and Output(Out) represents the sorted tensor while +Performs sorting on the input tensor along the given axis and outputs two +tensors, Output(Out) and Output(Indices). They reserve the same shape +with Input(X), and Output(Out) represents the sorted tensor while Output(Indices) gives the sorted order along the given axis Attr(axis). )DOC"); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 5fee66d968b73..89c817889f144 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -223,10 +223,10 @@ class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { "'paddle/framework/lod_rank_table.h' for more details."); AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array."); AddComment( - R"DOC(This Op build a big LoDTensor from a std::vector + R"DOC(This Op build a big LoDTensor from a std::vector and a LoDRankTable. It is supposed to be used in getting dynamic RNN's - outputs back to a normal LoDTensor. The std::vector - would be the output of RNN Op and the LoDRankTable would be build + outputs back to a normal LoDTensor. The std::vector + would be the output of RNN Op and the LoDRankTable would be build with RNN's input.)DOC"); } }; diff --git a/paddle/fluid/operators/assign_pos_op.cc b/paddle/fluid/operators/assign_pos_op.cc index ba1beaf834ee6..80412c7d6786a 100644 --- a/paddle/fluid/operators/assign_pos_op.cc +++ b/paddle/fluid/operators/assign_pos_op.cc @@ -62,7 +62,7 @@ class AssignPosOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( assign_pos_op Operator. -Assign pos decides which tokens should be fetched belong to +Assign pos decides which tokens should be fetched belong to specially counter orderingly. )DOC"); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 60e5912c4418d..203ccd8e6034d 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -297,7 +297,7 @@ tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu -dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) +dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) LSTM part: use lstm_x_t as input and compute as standard LSTM. 
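The kernel-dispatch cleanups in this patch (addmm above; batch_norm and data_norm below) all follow one pattern: the mutable library/layout locals are dropped and the MKLDNN branch returns its OpKernelType directly. A condensed sketch of the recurring shape, where MyOp is a placeholder name rather than a file in this patch:

// Sketch of the recurring GetExpectedKernelType refactor ("MyOp" is
// illustrative; the concrete instances are the addmm, batch_norm, and
// data_norm hunks in this patch).
framework::OpKernelType MyOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
  // Early return from the MKLDNN branch; no library/layout locals to
  // thread through the rest of the function.
  if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
    return framework::OpKernelType(input_data_type,
                                   ctx.GetPlace(),
                                   framework::DataLayout::kMKLDNN,
                                   framework::LibraryType::kMKLDNN);
  }
#endif
  // Default path: plain kernel type with the default layout and library.
  return framework::OpKernelType(input_data_type, ctx.GetPlace());
}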
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index a4a3f3cd2b054..84f22ebff4084 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -195,18 +195,16 @@ framework::OpKernelType BatchNormOp::GetExpectedKernelType( "Variance input should be of float type")); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::LibraryType library = framework::LibraryType::kPlain; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; #ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - this->CanMKLDNNBeUsed(ctx, input_data_type)) { - library = framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, + ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); } #endif - return framework::OpKernelType( - input_data_type, ctx.GetPlace(), layout, library); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType BatchNormOp::GetKernelTypeForVar( @@ -396,19 +394,18 @@ framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( } // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::LibraryType library = framework::LibraryType::kPlain; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - this->CanMKLDNNBeUsed(ctx, data_type)) { - library = framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; + if (this->CanMKLDNNBeUsed(ctx, data_type)) { + return framework::OpKernelType(data_type, + ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); } #endif - return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + return framework::OpKernelType(data_type, ctx.GetPlace()); } framework::OpKernelType BatchNormGradOp::GetKernelTypeForVar( diff --git a/paddle/fluid/operators/bmm_op.cc b/paddle/fluid/operators/bmm_op.cc index 305236134dbe1..b27594eed3a3e 100644 --- a/paddle/fluid/operators/bmm_op.cc +++ b/paddle/fluid/operators/bmm_op.cc @@ -44,8 +44,8 @@ class BmmOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor), The output tensor of Bmm op."); AddComment(R"DOC( The Bmm operator is used to perform batched matrix multiplication -over the last two dimensions of the input tensors `X` and `Y` -which are both 3-dimentionsal. +over the last two dimensions of the input tensors `X` and `Y` +which are both 3-dimentionsal. 
Examples: - X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 6bce8ec566f25..4f681bc6508d2 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -54,7 +54,7 @@ class BroadcastTensorsOpMaker : public framework::OpProtoAndCheckerMaker { "consistent with :code:`x`.") .AsDuplicable(); AddComment( - R"DOC(This OP is used to broadcast a vector of inputs + R"DOC(This OP is used to broadcast a vector of inputs with Tensor or LoDTensor type, following broadcast semantics.)DOC"); } }; diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc index 15cc71565091c..f168eb10ae769 100644 --- a/paddle/fluid/operators/center_loss_op.cc +++ b/paddle/fluid/operators/center_loss_op.cc @@ -80,10 +80,10 @@ class CenterLossOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("need_update", "whether need to update center info."); AddComment(R"DOC( **CenterLoss operator** -implemention of the center loss function in the papper<>, equations in this implement is:loss = 1/2 * (x-y)^2 ,where x(X) means the deep feature(output of last hidden layer ) -and y(Label) the target label +and y(Label) the target label )DOC"); } }; diff --git a/paddle/fluid/operators/channel_shuffle_op.cc b/paddle/fluid/operators/channel_shuffle_op.cc index 4d0e47157e606..69f75691a0318 100644 --- a/paddle/fluid/operators/channel_shuffle_op.cc +++ b/paddle/fluid/operators/channel_shuffle_op.cc @@ -52,9 +52,9 @@ class ChannelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { while keeping the original tensor shape. Please refer to the paper: - `ShuffleNet: An Extremely Efficient Convolutional Neural Network for + `ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices `_ - by Zhang et. al (2017) for more details. + by Zhang et. al (2017) for more details. )DOC"); } diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc index b5aa051cc2871..6ad9f6d491ed7 100644 --- a/paddle/fluid/operators/chunk_eval_op.cc +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -145,7 +145,7 @@ For some basics of chunking, please refer to ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. Here is a NER example of labeling for these tagging schemes: - + Li Ming works at Agricultural Bank of China in Beijing. IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC @@ -158,13 +158,13 @@ and LOC(LOCATION), and we can see that the labels have the form -("num_micro_batches", "Number of micro batches for pipeline."); AddComment(R"DOC( - This op is used by pipeline to copy tensors across micro batch scopes. - Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. - If need to copy back to the main scope, using to_main_scope option to copy the variable value of + This op is used by pipeline to copy tensors across micro batch scopes. + Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. + If need to copy back to the main scope, using to_main_scope option to copy the variable value of the current micro scope to the main scope. 
)DOC"); } diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index ee3ff671ede2d..ae1086b623f13 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -58,9 +58,9 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddComment(R"DOC( The crf_decoding operator reads the emission feature weights and the transition -feature weights learned by the linear_chain_crf operator and performs decoding. -It implements the Viterbi algorithm which is a dynamic programming algorithm -for finding the most likely sequence of hidden states, called the Viterbi path, +feature weights learned by the linear_chain_crf operator and performs decoding. +It implements the Viterbi algorithm which is a dynamic programming algorithm +for finding the most likely sequence of hidden states, called the Viterbi path, that results in a sequence of observed tags. The output of this operator changes according to whether Input(Label) is given: @@ -68,15 +68,15 @@ The output of this operator changes according to whether Input(Label) is given: 1. Input(Label) is given: This happens in training. This operator is used to co-work with the chunk_eval operator. - When Input(Label) is given, the crf_decoding operator returns tensor with the - sampe shape as Input(Label) whose values are fixed to be 0, indicating an - incorrect prediction, or 1 indicating a tag is correctly predicted. Such an + When Input(Label) is given, the crf_decoding operator returns tensor with the + sampe shape as Input(Label) whose values are fixed to be 0, indicating an + incorrect prediction, or 1 indicating a tag is correctly predicted. Such an output is the input to chunk_eval operator. 2. Input(Label) is not given: This is the standard decoding process. -The crf_decoding operator returns a row vector with shape [N x 1]/[B x S], here +The crf_decoding operator returns a row vector with shape [N x 1]/[B x S], here the shape depends on the inputs are LoDTensors or common tensors, whose values range from 0 to maximum tag number - 1, Each element indicates an index of a predicted tag. diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 41e9d673d3fe2..f7c72c11ddfac 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -102,14 +102,14 @@ Crop Operator. Crop input into output, as specified by offsets and shape. There are two ways to set the offsets: -1. In runtime: Using the input 'Offsets', which is a Variable and can be - output of other operators. This way is suitable for +1. In runtime: Using the input 'Offsets', which is a Variable and can be + output of other operators. This way is suitable for dynamic offsets. -2. In network configuration: Using the attribute 'offsets', which will be - set in Python configure script. This way is +2. In network configuration: Using the attribute 'offsets', which will be + set in Python configure script. This way is suitable for fixed offsets. -You CANNOT use these two ways at the same time. An exception will be raised -if input 'Offset' is configured and meanwhile the attribute 'offsets' is +You CANNOT use these two ways at the same time. An exception will be raised +if input 'Offset' is configured and meanwhile the attribute 'offsets' is not empty. 
 There are two ways to set shape:
diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc
index 52106c74314a4..c75a5eaf86dac 100644
--- a/paddle/fluid/operators/crop_tensor_op.cc
+++ b/paddle/fluid/operators/crop_tensor_op.cc
@@ -180,26 +180,26 @@ CropTensor Operator.
 
 Crop input into output, as specified by offsets and shape.
 
 There are three ways to set the offsets:
-1. Input 'OffsetsTensor: It is a tensor list. It should be set as a list that 
-   contains tensor variable in python configure script. 
+1. Input 'OffsetsTensor': It is a tensor list. It should be set as a list that
+   contains tensor variables in the Python configure script.
    This way is suitable for dynamic offsets.
-2. Input 'Offsets': It is a variable and can be output of other operators. 
+2. Input 'Offsets': It is a variable and can be output of other operators.
    This way is suitable for dynamic offsets.
-3. Attribute 'offsets': It will be set in python configure script. This way 
+3. Attribute 'offsets': It will be set in the Python configure script. This way
    is suitable for fixed offsets.
-You CANNOT use these three ways at the same time. An exception will be raised 
-if input 'OffsetsTensor' or 'Offset' is configured and meanwhile the attribute 'offsets' is 
+You CANNOT use these three ways at the same time. An exception will be raised
+if input 'OffsetsTensor' or 'Offset' is configured and meanwhile the attribute 'offsets' is
 not empty.
 
 There are three ways to set shape:
 1. Input 'ShapeTensor': It is a tensor list. It should be set as a list that contains
-   tensor variable in python configure script. This way is suitable 
+   tensor variables in the Python configure script. This way is suitable
    for dynamic shape.
-2. Input 'Shape': It is a Variable and can be output of other operators. This way is suitable 
+2. Input 'Shape': It is a Variable and can be output of other operators. This way is suitable
    for dynamic shape.
-2. Attribute 'shape': crop input X into the shape described by a list. The size of shape 
-   list should be the same as the dimension size of input X. This way is 
+3. Attribute 'shape': crop input X into the shape described by a list. The size of the shape
+   list should be the same as the dimension size of input X. This way is
    suitable for fixed shape.
 
 The input should be a k-D tensor(k > 0 and k < 7). As an example:
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 0d98f5b75e4fb..41a0d6ad20b04 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -250,10 +250,10 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 CrossEntropy Operator.
 
-The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. 
-The matrix's second dimension(row length) is as same as the original last 
-dimension, and the first dimension(column length) is the product of all other 
-original dimensions. Then the softmax computation will take palce on each raw 
+The input 'X' and 'Label' will first be logically flattened to 2-D matrices.
+The matrix's second dimension (row length) is the same as the original last
+dimension, and the first dimension (column length) is the product of all other
+original dimensions. Then the softmax computation will take place on each row
 of the flattened matrices.
 It supports both standard cross-entropy and soft-label cross-entropy loss
@@ -385,10 +385,10 @@ class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Hard-label CrossEntropy Operator.
 
-The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. 
-The matrix's second dimension(row length) is as same as the original last 
-dimension, and the first dimension(column length) is the product of all other 
-original dimensions. Then the softmax computation will take palce on each raw 
+The input 'X' and 'Label' will first be logically flattened to 2-D matrices.
+The matrix's second dimension (row length) is the same as the original last
+dimension, and the first dimension (column length) is the product of all other
+original dimensions. Then the softmax computation will take place on each row
 of the flattened matrices.
 
 Only hard labels are supported.
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
index dbab71e1619ec..7731b7207180a 100644
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -93,12 +93,12 @@ and then delete all blanks in sequence.
     Output.dims = {8, 1}
     Output.LoD = [[0, 6, 8]]
 or Given:
-    Input.data = [[0, 1, 2, 2, 0, 4], 
-                  [0, 4, 5, 0, 6, 0], 
+    Input.data = [[0, 1, 2, 2, 0, 4],
+                  [0, 4, 5, 0, 6, 0],
                   [0, 7, 7, 7, 0, 0]]
     InputLength.data = [[6],
                         [5],
-                        [4]], 
+                        [4]],
     Input.dims = {3, 6},
     Input.Lod = []
 And:
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index 9faf615a1601d..f5fd56edef900 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -190,7 +190,7 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
 CUDNN LSTM implementation
 
 A four-gate Long Short-Term Memory network with no peephole connections.
-In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, 
+In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
 the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
 $$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
@@ -217,7 +217,7 @@ the cell input ct-1 and the previous layer input xt given matrices W, R and bias
 - $\tilde{c_t}$ is also called candidate hidden state,
   which is computed based on the current input and the previous hidden state.
 
-Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, 
+Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
 X represents a matrix multiplication
diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc
index a37b9e951ff0c..cc7e568bac848 100644
--- a/paddle/fluid/operators/cumprod_op.cc
+++ b/paddle/fluid/operators/cumprod_op.cc
@@ -35,7 +35,7 @@ class CumprodOpMaker : public framework::OpProtoAndCheckerMaker {
             "(int), The dim along which the input tensors will be cumproded");
     AddComment(
         R"DOC(Cumprod operator. Return the cumprod results of the input elements along the dim.
-      For example, if input X is a tensor with rank 1 and N elements, the output will also be a tensor 
-      with rank 1 and N elements, and elements y[i] = x[0] * x[1] * x[2] *...* x[i] (0<=i<=N-1). 
+      For example, if input X is a tensor with rank 1 and N elements, the output will also be a tensor
+      with rank 1 and N elements, and elements y[i] = x[0] * x[1] * x[2] *...* x[i] (0<=i<=N-1).
 )DOC");
diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc
--- a/paddle/fluid/operators/data_norm_op.cc
+++ b/paddle/fluid/operators/data_norm_op.cc
@@ class DataNormOp : public framework::OperatorWithKernel {
-    framework::LibraryType library = framework::LibraryType::kPlain;
-    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
     auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
 #ifdef PADDLE_WITH_MKLDNN
-    if (library == framework::LibraryType::kPlain &&
-        this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-      library = framework::LibraryType::kMKLDNN;
-      layout = framework::DataLayout::kMKLDNN;
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      return framework::OpKernelType(input_data_type,
+                                     ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
     }
 #endif
-    return framework::OpKernelType(
-        input_data_type, ctx.GetPlace(), layout, library);
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
 
@@ -511,19 +509,18 @@ class DataNormGradOp : public framework::OperatorWithKernel {
   }
 
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::LibraryType library = framework::LibraryType::kPlain;
-  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
   auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
 #ifdef PADDLE_WITH_MKLDNN
-  if (library == framework::LibraryType::kPlain &&
-      this->CanMKLDNNBeUsed(ctx, data_type)) {
-    library = framework::LibraryType::kMKLDNN;
-    layout = framework::DataLayout::kMKLDNN;
+  if (this->CanMKLDNNBeUsed(ctx, data_type)) {
+    return framework::OpKernelType(data_type,
+                                   ctx.GetPlace(),
+                                   framework::DataLayout::kMKLDNN,
+                                   framework::LibraryType::kMKLDNN);
   }
 #endif
-  return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
+  return framework::OpKernelType(data_type, ctx.GetPlace());
 }
 };
diff --git a/paddle/fluid/operators/decode_jpeg_op.cc b/paddle/fluid/operators/decode_jpeg_op.cc
index 789976c887bbe..6e12b25028b04 100644
--- a/paddle/fluid/operators/decode_jpeg_op.cc
+++ b/paddle/fluid/operators/decode_jpeg_op.cc
@@ -61,9 +61,9 @@ class DecodeJpegOpMaker : public framework::OpProtoAndCheckerMaker {
              "of the JPEG image. It is a tensor with rank 1.");
     AddOutput("Out", "The output tensor of DecodeJpeg op");
     AddComment(R"DOC(
-This operator decodes a JPEG image into a 3 dimensional RGB Tensor 
-or 1 dimensional Gray Tensor. Optionally converts the image to the 
-desired format. The values of the output tensor are uint8 between 0 
+This operator decodes a JPEG image into a 3-dimensional RGB Tensor
+or a 1-dimensional Gray Tensor. Optionally converts the image to the
+desired format. The values of the output tensor are uint8 between 0
 and 255.
 )DOC");
     AddAttr(
diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cc b/paddle/fluid/operators/deformable_conv_v1_op.cc
index 0b817a8f422b1..ed70e54678981 100644
--- a/paddle/fluid/operators/deformable_conv_v1_op.cc
+++ b/paddle/fluid/operators/deformable_conv_v1_op.cc
@@ -73,13 +73,13 @@ class DeformableConvV1OpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 **Deformable Convolution v1 Operator**
 
-Deformable Convolution is a new method based Convolution which feature has offset 
+Deformable Convolution is a Convolution-based method in which features have offsets
 in spatial location.
 
-1. Get offset of each pixel in feature map with convolution layers which number 
+1. Get the offset of each pixel in the feature map with convolution layers whose number
 of channels should be double of weight size.
 
-2. Add offset to pixel to get new location and the new value which are computed 
+2.
Add offset to pixel to get new location and the new value which are computed directly through bilinear interpolation with four nearest pixel. 3. Get the product of pixel and weight as result diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index f83a4c04a8162..bac1bb04bc0dd 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -104,7 +104,7 @@ class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "W is thewidth of output. "); AddComment(R"DOC( **DeformablePSROIPooling Operator** -DeformablePSROIPooling is a new method based Region of interest pooling +DeformablePSROIPooling is a new method based Region of interest pooling (also known as RoI pooling). The operator has four steps: diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc index cd17a8c9883df..89650d62351f0 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cc +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -82,14 +82,14 @@ This operator clips input boxes to original input images. For each input box, The formula is given as follows: $$xmin = \max(\min(xmin, im_w - 1), 0)$$ - $$ymin = \max(\min(ymin, im_h - 1), 0)$$ + $$ymin = \max(\min(ymin, im_h - 1), 0)$$ $$xmax = \max(\min(xmax, im_w - 1), 0)$$ $$ymax = \max(\min(ymax, im_h - 1), 0)$$ where im_w and im_h are computed from ImInfo, the formula is given as follows: $$im_w = \round(width / im_scale)$$ - $$im_h = \round(height / im_scale)$$ + $$im_h = \round(height / im_scale)$$ )DOC"); } }; diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 53a9d04fb585f..5120f687dee0e 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -98,9 +98,9 @@ The Encoding schema described below: oy = (ty - py) / ph / pyv - ow = log(abs(tw / pw)) / pwv + ow = log(abs(tw / pw)) / pwv - oh = log(abs(th / ph)) / phv + oh = log(abs(th / ph)) / phv The Decoding schema described below: @@ -116,11 +116,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the -encoded/decoded coordinates, width and height. +encoded/decoded coordinates, width and height. -During Box Decoding, two modes for broadcast are supported. Say target box has +During Box Decoding, two modes for broadcast are supported. Say target box has shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior -box will broadcast to target box along the assigned axis. +box will broadcast to target box along the assigned axis. )DOC"); } }; diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc index d641a6fd41ef7..c1b7e1678d881 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc @@ -189,7 +189,7 @@ Decode the target bounding box with the prior_box information. 
The Decoding schema is described below: $$ - ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2} + ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2} $$ $$ oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2} @@ -205,11 +205,11 @@ where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, `phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the -decoded coordinates, width and height in decode_box. +decoded coordinates, width and height in decode_box. decode_box is obtained after box decode, then assigning schema is described below: -For each prior_box, use the best non-background class's decoded values to +For each prior_box, use the best non-background class's decoded values to update the prior_box locations and get output_assign_box. So, the shape of output_assign_box is the same as PriorBox. )DOC"); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index ddb8685ee3ab8..48902f517967b 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -125,7 +125,7 @@ class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { This operator concats all proposals from different images and different FPN levels. Then sort all of those proposals by objectness confidence. Select the post_nms_topN RoIs in - total. Finally, re-sort the RoIs in the order of batch index. + total. Finally, re-sort the RoIs in the order of batch index. )DOC"); } }; @@ -145,7 +145,7 @@ REGISTER_OP_CPU_KERNEL(collect_fpn_proposals, REGISTER_OP_VERSION(collect_fpn_proposals) .AddCheckpoint( R"ROC( - Upgrade collect_fpn_proposals add a new input + Upgrade collect_fpn_proposals add a new input [MultiLevelRoIsNum] and add a new output [RoisNum].)ROC", paddle::framework::compatible::OpVersionDesc() .NewInput("MultiLevelRoIsNum", diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index eeda4c819e12a..15918030c024b 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -86,7 +86,7 @@ class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker { "If true, im_shape pixel offset is 1.") .SetDefault(true); AddComment(R"DOC( -This operator is the second version of generate_proposals op to generate +This operator is the second version of generate_proposals op to generate bounding box proposals for Faster RCNN. The proposals are generated for a list of images based on image score 'Scores', bounding box regression result 'BboxDeltas' as @@ -96,9 +96,9 @@ boxes. The difference between this version and the first version is that the image scale is no long needed now, so the input requires im_shape instead of im_info. -The change aims to unify the input for all kinds of objective detection -such as YOLO-v3 and Faster R-CNN. As a result, the min_size represents the -size on input image instead of original image which is slightly different +The change aims to unify the input for all kinds of objective detection +such as YOLO-v3 and Faster R-CNN. 
As a result, the min_size represents the +size on input image instead of original image which is slightly different to before and will not effect the result. )DOC"); diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc index c31c630cd6ccd..5f46e9ab51bc2 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -95,7 +95,7 @@ boxes in 'Y' are shared by all instance of the batched inputs of X. Given two boxes A and B, the calculation of IOU is as follows: $$ -IOU(A, B) = +IOU(A, B) = \\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} $$ diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 1c0d19d9d5937..1c755c62ebc1b 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -116,7 +116,7 @@ independently for each class. The outputs is a 2-D LoDTenosr, for each image, the offsets in first dimension of LoDTensor are called LoD, the number of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, means there is no detected bbox for this image. Now this operator has one more -output, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of +output, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of detected bbox for this image. For more information on Matrix NMS, please refer to: diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index 163da3cdd9727..f3df3b228d7ee 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -383,11 +383,11 @@ class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Mine hard examples Operator. This operator implements hard example mining to select a subset of negative box indices. -For each image, selects the box with highest losses. subject to the condition that the -box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. -The selected number is min(sample_size, max_negative_box_number) when mining_type is -hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) -when mining_type is max_negative, where the max_negative_box_number is the count of +For each image, selects the box with highest losses. subject to the condition that the +box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. +The selected number is min(sample_size, max_negative_box_number) when mining_type is +hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) +when mining_type is max_negative, where the max_negative_box_number is the count of MatchIndices elements with value -1. )DOC"); } diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index 915b174f174c5..2f3b59db5c038 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -640,7 +640,7 @@ where `tx`, `ty`, `tw`, `th` denote the predicted box's center coordinates, widt and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the anchor's center coordinates, width and height. 
`pxv`, `pyv`, `pwv`, `phv` denote the variance of the anchor box and `ox`, `oy`, `ow`, `oh` denote the -decoded coordinates, width and height. +decoded coordinates, width and height. Then the top decoded prediction from all levels are merged followed by NMS. In the NMS step, this operator prunes away boxes that have high IOU diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 8fbfe2ad8548c..c6e4c00f79bba 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -661,7 +661,7 @@ The rest anchors would not contibute to the RPN training loss ScoreIndex is composed of foreground anchor indexes(positive labels) and background anchor indexes(negative labels). LocationIndex is exactly same -as the foreground anchor indexes since we can not assign regression target to +as the foreground anchor indexes since we can not assign regression target to the background anchors. The classification targets(TargetLabel) is a binary class label (of being @@ -730,16 +730,16 @@ class RetinanetTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { This layer can be, for given the Intersection-over-Union (IoU) overlap between anchors and ground truth boxes, to assign classification and regression targets to each anchor, these target labels are used for - train retinanet. - + train retinanet. + Every anchor is assigned with a length C one-hot vector of classification targets, and a 4-vector of box regression targets, where C is the class number. The assignment rules are as followed: - + 1. Anchors are assigned to ground-truth boxes when: (i) it has the highest IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher than positive_overlap(0.5) with any ground-truth box. - + 2. Anchors are assigned to background when its IoU ratio is lower than negative_overlap (0.4) for all ground-truth boxes. diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc index 99deee3f72aea..5b8e6739bfbfd 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -131,7 +131,7 @@ If id = MatchIndices[i][j] > 0, Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] OutWeight[i][j] = 1. -Otherwise, +Otherwise, Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} OutWeight[i][j] = 0. diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index cbe0548f27541..3261f8fca3d20 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -192,19 +192,19 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0.5); AddComment(R"DOC( This operator generates YOLO detection boxes from output of YOLOv3 network. - + The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict + should be the same, H and W specify the grid size, each grid point predict given number boxes, this given number, which following will be represented as S, is specified by the number of anchors. In the second dimension(the channel dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false, otherwise C should be equal to S * (6 + class_num). 
class_num is the object - category number of source dataset(such as 80 in coco dataset), so the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, - also includes confidence score of the box and class one-hot key of each anchor + category number of source dataset(such as 80 in coco dataset), so the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor box. - Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions should be as follows: $$ @@ -225,9 +225,9 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { The logistic regression value of the 5th channel of each anchor prediction boxes represents the confidence score of each prediction box, and the logistic - regression value of the last :attr:`class_num` channels of each anchor prediction + regression value of the last :attr:`class_num` channels of each anchor prediction boxes represents the classifcation scores. Boxes with confidence scores less than - :attr:`conf_thresh` should be ignored, and box final scores is the product of + :attr:`conf_thresh` should be ignored, and box final scores is the product of confidence scores and classification scores. $$ diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index afdcfcd42baac..0448d7e5183c8 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -105,14 +105,14 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator generates yolov3 loss based on given predict result and ground truth boxes. - + The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict + should be the same, H and W specify the grid size, each grid point predict given number bounding boxes, this given number, which following will be represented as S, is specified by the number of anchor clusters in each scale. In the second dimension(the channel - dimension), C should be equal to S * (class_num + 5), class_num is the object - category number of source dataset(such as 80 in coco dataset), so in the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + dimension), C should be equal to S * (class_num + 5), class_num is the object + category number of source dataset(such as 80 in coco dataset), so in the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor box. Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions @@ -135,21 +135,21 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { and :math:`p_w, p_h` is specified by anchors. As for confidence score, it is the logistic regression value of IoU between - anchor boxes and ground truth boxes, the score of the anchor box which has - the max IoU should be 1, and if the anchor box has IoU bigger than ignore + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger than ignore thresh, the confidence score loss of this anchor box will be ignored. 
Therefore, the yolov3 loss consists of three major parts: box location loss, - objectness loss and classification loss. The L1 loss is used for - box coordinates (w, h), sigmoid cross entropy loss is used for box + objectness loss and classification loss. The L1 loss is used for + box coordinates (w, h), sigmoid cross entropy loss is used for box coordinates (x, y), objectness loss and classification loss. - Each groud truth box finds a best matching anchor box in all anchors. + Each groud truth box finds a best matching anchor box in all anchors. Prediction of this anchor box will incur all three parts of losses, and prediction of anchor boxes with no GT box matched will only incur objectness loss. - In order to trade off box coordinate losses between big boxes and small + In order to trade off box coordinate losses between big boxes and small boxes, box coordinate losses will be mutiplied by scale weight, which is calculated as follows. @@ -165,12 +165,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { $$ While :attr:`use_label_smooth` is set to be :attr:`True`, the classification - target will be smoothed when calculating classification loss, target of + target will be smoothed when calculating classification loss, target of positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of negetive samples will be smoothed to :math:`1.0 / class\_num`. - While :attr:`GTScore` is given, which means the mixup score of ground truth - boxes, all losses incured by a ground truth box will be multiplied by its + While :attr:`GTScore` is given, which means the mixup score of ground truth + boxes, all losses incured by a ground truth box will be multiplied by its mixup score. )DOC"); } diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc index d06f5739df049..1f7b5dbdce9c8 100644 --- a/paddle/fluid/operators/dgc_op.cc +++ b/paddle/fluid/operators/dgc_op.cc @@ -126,10 +126,10 @@ class DGCOpMaker : public framework::OpProtoAndCheckerMaker { DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. This optimizer will do two things: - + 1. Compress the gradient by get TopK import value from tensor \ and use it for allreduce to reduce network bandwidth. - + 2. Call momentum to optimize on the cost. )DOC"); diff --git a/paddle/fluid/operators/diag_embed_op.cc b/paddle/fluid/operators/diag_embed_op.cc index 0dc5d024ec4a8..45e5e51c410e8 100644 --- a/paddle/fluid/operators/diag_embed_op.cc +++ b/paddle/fluid/operators/diag_embed_op.cc @@ -47,11 +47,11 @@ class DiagEmbedOpMaker : public framework::OpProtoAndCheckerMaker { )DOC") .SetDefault(-1); - AddComment(R"DOC(Creates a tensor whose diagonals of certain 2D planes - (specified by dim1 and dim2) are filled by input. - To facilitate creating batched diagonal matrices, + AddComment(R"DOC(Creates a tensor whose diagonals of certain 2D planes + (specified by dim1 and dim2) are filled by input. + To facilitate creating batched diagonal matrices, the 2D planes formed by the last two dimensions of the returned tensor - are chosen by default. + are chosen by default. )DOC"); } }; diff --git a/paddle/fluid/operators/diag_op.cc b/paddle/fluid/operators/diag_op.cc index 8ccc5ff3891b9..f7b2c4915662c 100644 --- a/paddle/fluid/operators/diag_op.cc +++ b/paddle/fluid/operators/diag_op.cc @@ -45,7 +45,7 @@ class DiagOpMaker : public framework::OpProtoAndCheckerMaker { "Diagonal values of square matrix. 
It is a tensor with rank 1."); AddOutput("Out", "A square matrix."); AddComment(R"DOC( - Return a square matrix with specified diagonal values. + Return a square matrix with specified diagonal values. )DOC"); } }; diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc index 8197b115cddcc..70de5a3bb7588 100644 --- a/paddle/fluid/operators/edit_distance_op.cc +++ b/paddle/fluid/operators/edit_distance_op.cc @@ -65,7 +65,7 @@ strings and their references. Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into another. -The operations include insertion, deletion, and substitution. +The operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", A will be transformed into B at least after two substitutions and one diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 610e5932b1c36..e722d5f7e6e99 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -216,47 +216,12 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { AddInputX(); AddInputY(); AddOpOutput(); - AddAttr("axis", "(int, default -1). If X.dimension != Y.dimension," "Y.dimension must be a subsequence of x.dimension. And axis " "is the start dimension index " "for broadcasting Y onto X. ") .SetDefault(-1); - AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") - .SetDefault(false) - .AsExtra(); - AddAttr("x_data_format", "This parameter is no longer used.") - .SetDefault("") - .AsExtra(); - AddAttr("y_data_format", "This parameter is no longer used.") - .SetDefault("") - .AsExtra(); - AddAttr( - "use_quantizer", - "(bool, default false) " - "This parameter is no longer used. Use 'mkldnn_data_type' instead.") - .SetDefault(false) - .AsExtra(); - AddAttr( - "mkldnn_data_type", - "(string, default \"float32\"). Data type of mkldnn kernel") - .SetDefault("float32") - .InEnum({"float32", "int8", "bfloat16"}) - .AsExtra(); - /* int8 parameters */ - AddAttr("Scale_x", - "(float, default 1.0f), The quantize scale of X tensor") - .SetDefault(1.0f) - .AsExtra(); - AddAttr("Scale_y", - "(float, default 1.0f), The quantize scale of Y tensor") - .SetDefault(1.0f) - .AsExtra(); - AddAttr("Scale_out", - "(float, default 1.0f), The quantize scale of output data") - .SetDefault(1.0f) - .AsExtra(); AddOpComment(); } diff --git a/paddle/fluid/operators/fill_any_op.cc b/paddle/fluid/operators/fill_any_op.cc index 23d00e47b48b4..4e6929b445038 100644 --- a/paddle/fluid/operators/fill_any_op.cc +++ b/paddle/fluid/operators/fill_any_op.cc @@ -30,7 +30,7 @@ class FillAnyOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("value_int", "The int var to fill in Tensor").SetDefault(0); AddComment(R"DOC(Fill operator with backward; - Fill an tensor with `value`. + Fill an tensor with `value`. 
)DOC"); }; }; diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc index 1bec91a54623a..a0ac46c4a6603 100644 --- a/paddle/fluid/operators/filter_by_instag_op.cc +++ b/paddle/fluid/operators/filter_by_instag_op.cc @@ -80,15 +80,15 @@ class FilterByInstagOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("LossWeight", "(Tensor) loss weight."); AddOutput("IndexMap", "(LoDTensor) mapping from Out rows to X1 rows"); AddComment(R"DOC( -Filter By Instag Op +Filter By Instag Op This operator is used to filter embeded ins. -There are 3 inputs. First is embeded ins, Second is tags for ins, +There are 3 inputs. First is embeded ins, Second is tags for ins, Third is tags to filter. There are 3 outputs. First is filtered embeded ins, Second is Loss Weight, -Third is the IndexMap from Out line number to X1 line number. +Third is the IndexMap from Out line number to X1 line number. )DOC"); } }; diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 149d2bdac3c02..1c4127b6fbf85 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -70,9 +70,9 @@ class FoldOpMaker : public framework::OpProtoAndCheckerMaker { **Fold Operator** This Operator is used to combines an array of sliding local blocks into a large containing -tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each -combined value in the resulting large tensor by summing all values from all containing blocks. -Unfold extracts the values in the local blocks by copying from the large tensor. So, if the +tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each +combined value in the resulting large tensor by summing all values from all containing blocks. +Unfold extracts the values in the local blocks by copying from the large tensor. So, if the blocks overlap, they are not inverses of each other. 
)DOC"); } diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 30badd3125588..90f6d34535196 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -432,8 +432,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( The fused_attention operator is the same as following pseudo codes: - // @input: [batch_size, seq_len, embed_dim] - // @final_out: [batch_size, seq_len, num_heads, head_dim] + // @input: [batch_size, seq_len, embed_dim] + // @final_out: [batch_size, seq_len, num_heads, head_dim] residual = input if (pre_layernorm) query = layer_norm(input); @@ -447,7 +447,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { out = dropout(out); out = out * v; out = transpose(out, perm=[0, 2, 1, 3]); - + } // out linear out = linear(out); diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc index 081a1ab0a0d2a..3e888a2e67fc7 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cc @@ -140,8 +140,8 @@ class FusedBiasDropoutResidualLnOpMaker AddComment(R"DOC( Add fused bias_dropout_residual_layer_norm op whose logic is as follows: - // @input: [batch_size, seq_len, embed_dim] - // @final_out: [batch_size, seq_len, embed_dim] + // @input: [batch_size, seq_len, embed_dim] + // @final_out: [batch_size, seq_len, embed_dim] y = layer_norm(residual + dropout(bias + x)); )DOC"); } diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index 2e6f991e41fa1..0823f391fd086 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -174,7 +174,7 @@ class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Add fused attention op whose logic is as follows: { - q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) + q = paddle.einsum('nbqa,ahc->nbqhc', q_data, self.query_w) k = paddle.einsum('nbka,ahc->nbkhc', m_data, self.key_w) v = paddle.einsum('nbka,ahc->nbkhc', m_data, self.value_w) @@ -189,10 +189,10 @@ class FusedGateAttentionOpMaker : public framework::OpProtoAndCheckerMaker { self.gating_w) + self.gating_b gate_values_1 = nn.functional.sigmoid(gate_values) weighted_avg *= gate_values_1 - + output = paddle.einsum('nbqhc,hco->nbqo', weighted_avg, self.output_w) + self.output_b - + } )DOC"); } diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index f9366bace3387..d14e30a5f7f2a 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -164,32 +164,32 @@ class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias)."); AddOutput("ReserveSpace", - R"DOC(Reserve GPU space to place - auxiliary data pointer. It is used to pass auxiliary data pointer - for fused_gemm_epilogue op. If not given (empty string), the + R"DOC(Reserve GPU space to place + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue op. 
If not given (empty string), the auxiliary mode would not be enabled.)DOC") .AsDispensable() .AsExtra(); AddAttr( "trans_x", - R"DOC((bool, default false), Whether to transpose input tensor X - or not. The input tensor X coulbe be more than two dimension. When - set trans_x=true, it would fully reverse X. For instant: X with shpae + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X could be more than two dimensions. When + set trans_x=true, it would fully reverse X. For instance: X with shape [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") .SetDefault(false); AddAttr( "trans_y", - R"DOC((bool, default false), Whether to transpose input tensor Y - or not. The input tensor Y should be two dimension. When - set trans_y=true, it would transpose Y. For instant: Y with shpae + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two-dimensional. When + set trans_y=true, it would transpose Y. For instance: Y with shape [d0, d1] -> [d1, d0].)DOC") .SetDefault(false); AddAttr( "activation", - R"DOC((string, default none), The activation function. It could be - one of {none, relu, gelu}. When none is given, Act would be null + R"DOC((string, default none), The activation function. It could be + one of {none, relu, gelu}. When none is given, Act would be a null operation)DOC") .SetDefault("none"); @@ -337,9 +337,9 @@ class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias"); AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias"); AddInput("ReserveSpace", - R"DOC(A GPU space to fetch - auxiliary data pointer. It is used to pass auxiliary data pointer - for fused_gemm_epilogue_grad op. If not given (empty string), the + R"DOC(A GPU space to fetch + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue_grad op. If not given (empty string), the auxiliary mode would not be enabled.)DOC") .AsDispensable(); @@ -352,23 +352,23 @@ class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddAttr( "trans_x", - R"DOC((bool, default false), Whether to transpose input tensor X - or not. The input tensor X coulbe be more than two dimension. When - set trans_x=true, it would fully reverse X. For instant: X with shpae + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X could be more than two dimensions. When + set trans_x=true, it would fully reverse X. For instance: X with shape [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") .SetDefault(false); AddAttr( "trans_y", - R"DOC((bool, default false), Whether to transpose input tensor Y - or not. The input tensor Y should be two dimension. When - set trans_y=true, it would transpose Y. For instant: Y with shpae + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two-dimensional. When + set trans_y=true, it would transpose Y. For instance: Y with shape [d0, d1] -> [d1, d0].)DOC") .SetDefault(false); AddAttr( "activation_grad", - R"DOC((string, default none), The backward activation function. It could be - one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would + R"DOC((string, default none), The backward activation function. It could be + one of {none, relu_grad, gelu_grad}. 
When none is given, the backward Act would + be a null operation)DOC") .SetDefault("none"); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 9556ed12880ae..e2d2cf071caba 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -251,7 +251,7 @@ void FusionGRUOpMaker::Make() { .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator. -This operator fuse the fully-connected operator into GRU, +This operator fuses the fully-connected operator into GRU, refer to the GRU op for more details. )DOC"); } diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index cb948ea59d241..6be6763492345 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -79,7 +79,7 @@ void FusionSquaredMatSubOpMaker::Make() { AddAttr("scalar", "The scalar on output matrix.").SetDefault(1.f); AddComment(R"DOC( Fusion Squared Matrix and subtract operator. - + ( (X * Y).^2 - (X.^2 * Y.^2) ) .* scalar )DOC"); } diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc index eab555985c205..2a8917f1c005d 100644 --- a/paddle/fluid/operators/fused/multi_gru_op.cc +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -219,7 +219,7 @@ void MultiGRUOpMaker::Make() { .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator. -This operator fuse the fully-connected operator into GRU, +This operator fuses the fully-connected operator into GRU, refer to the GRU op for more details. )DOC"); } diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 5852a5c04bde6..779e28c85b72a 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -274,10 +274,10 @@ class ResNetUnitOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("act_type", "The activation type to be fused.") .SetDefault("relu"); AddComment(R"DOC( -Fusion op of the basic unit of resnet block. +Fusion op of the basic unit of resnet block. The implementation is based on the latest fusion op interface in cuDNN v8.0. -For more details: +For more details: https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnFusedOps_t )DOC"); diff --git a/paddle/fluid/operators/fused_token_prune_op.cc b/paddle/fluid/operators/fused_token_prune_op.cc index 50ca45967b7bd..da43ab7588647 100644 --- a/paddle/fluid/operators/fused_token_prune_op.cc +++ b/paddle/fluid/operators/fused_token_prune_op.cc @@ -81,7 +81,7 @@ class FusedTokenPruneOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( fused_token_prune op is used to fuse multiple ops to perform token pruning. In this op: - 1. Elements of Attn will be set to zero if their corresponding mask is smaller than 0. + 1. Elements of Attn will be set to zero if their corresponding mask is smaller than 0. 2. The second dimension of X will be sorted by Attn. 3. The last (max_seq_len - slimmed_seq_len) lines of X will be pruned. 4. The remaining part of the sorted X will be output. 
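The four pruning steps listed above can be read as the following numpy sketch (illustrative only, not code from this patch; shapes and the function name are hypothetical, with attn and mask of shape [batch, max_seq_len] and x of shape [batch, max_seq_len, hidden]):

```python
import numpy as np

def fused_token_prune(x, attn, mask, slimmed_seq_len):
    attn = np.where(mask < 0, 0.0, attn)             # 1. zero out masked scores
    order = np.argsort(-attn, axis=1)                # 2. sort tokens by attn, descending
    kept = order[:, :slimmed_seq_len]                # 3. prune the trailing tokens
    out = np.take_along_axis(x, kept[..., None], 1)  # 4. output the remaining part
    return out

x = np.arange(2 * 4 * 3, dtype=float).reshape(2, 4, 3)
attn = np.array([[0.9, 0.1, 0.5, 0.3], [0.2, 0.8, 0.4, 0.6]])
mask = np.ones((2, 4))
print(fused_token_prune(x, attn, mask, slimmed_seq_len=2).shape)  # (2, 2, 3)
```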
diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index 6ff1841786ab0..59648bc7d17eb 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -59,13 +59,13 @@ class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Gather_Nd Operator. - This function is actually a high-dimensional extension of gather - and supports for simultaneous indexing by multiple axes. Out is - obtained by gathering slices from X into a tensor with shape + This function is actually a high-dimensional extension of gather + and supports simultaneous indexing by multiple axes. Out is + obtained by gathering slices from X into a tensor with shape Index.shape[:-1] + X.shape[Index.shape[-1]:]. Example: - + Given: X = [[[ 0, 1, 2, 3], [ 4, 5, 6, 7], @@ -73,7 +73,7 @@ class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]]] - + X.shape = (2, 3, 4) *Case 1: @@ -81,7 +81,7 @@ class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { Index = [[1]] we get: - Out = + Out = [[12, 13, 14, 15], [16, 17, 18, 19], [20, 21, 22, 23]] @@ -91,7 +91,7 @@ class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { Index = [[0,2]] we get: - + Out = [8, 9, 10, 11] *Case 3: diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 80964323e6b01..8d92305eb6f15 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -161,7 +161,7 @@ REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( - Upgrade gaussian_random add new inputs [ShapeTensor] and [ShapeTensorList] + Upgrade gaussian_random, add new inputs [ShapeTensor] and [ShapeTensorList] and modify the attribute of [shape])ROC", paddle::framework::compatible::OpVersionDesc() .NewInput("ShapeTensor", diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index add87fdd3c112..a16544b8ba3de 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -100,7 +100,7 @@ class GeluOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) use approximation of gelu") .SetDefault(false); AddComment(R"DOC( -Gelu Activation Operator. +Gelu Activation Operator. For more details, please refer to [Gaussian Error Linear Units](https://arxiv.org/pdf/1606.08415.pdf). diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index b954ecab704b4..c907ae2b704b8 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -83,10 +83,10 @@ Graph Learning Send_Recv combine operator. $Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$ -This operator is mainly used in Graph Learning domain, and the main purpose is to reduce -intermediate memory consumption in the process of message passing. -Take `x` as the input tensor, we first use `src_index` to gather corresponding data, -and then use `dst_index` to update the corresponding position of output tensor in different +This operator is mainly used in Graph Learning domain, and the main purpose is to reduce +intermediate memory consumption in the process of message passing. 
+Take `x` as the input tensor, we first use `src_index` to gather corresponding data, +and then use `dst_index` to update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. )DOC"); diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc index af16609df3ebd..6c38ee65e8758 100644 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ b/paddle/fluid/operators/graph_send_ue_recv_op.cc @@ -97,7 +97,7 @@ intermediate memory consumption in the process of message passing. Take `X` as the input tensor, we first use `src_index` to gather corresponding data. Then the gathered data should be computed with `Y` in different message_ops, like add, sub, mul, and div, -and get the computation result. Then, use `dst_index` to update the corresponding position of output +and get the computation result. Then, use `dst_index` to update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. )DOC"); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index 97ec937911526..12b18bc55e2eb 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -89,12 +89,12 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("zeros"); AddComment(R"DOC( - This operation samples input X by using bilinear or nearest interpolation based on + This operation samples input X by using bilinear or nearest interpolation based on flow field grid, which is usually generated by affine_grid. The grid of - shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates - with shape [N, H, W] each, where grid_x is indexing the 4th dimension - (in width dimension) of input data x and grid_y is indexing the 3rd - dimension (in height dimension), finally results is the bilinear + shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates + with shape [N, H, W] each, where grid_x is indexing the 4th dimension + (in width dimension) of input data x and grid_y is indexing the 3rd + dimension (in height dimension), finally the result is the bilinear interpolation value or nearest value of 4 nearest corner points. For bilinear interpolation mode: @@ -105,7 +105,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) Step 2: - Indices input data X with grid (x, y) in each [H, W] area, and bilinear + Index input data X with grid (x, y) in each [H, W] area, and bilinearly interpolate point value by 4 nearest points. wn ------- y_n ------- en diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 143862350b04d..f111a379e16fe 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -63,7 +63,7 @@ class HashOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor) Input tensor of hash operator."); AddOutput("Out", "(Tensor) Output tensor of hash operator."); AddComment(R"DOC( - Execute `num_hash` times xxHash algorithm on all elements on second dimension of input. + Execute the xxHash algorithm `num_hash` times on all elements along the second dimension of the input. 
)DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 0d1006658a492..6741af7638809 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -82,7 +82,7 @@ take any values from (-inf, inf), but the labels should be either -1 or 1. Then, the hinge loss is computed as follows: $$ -L_(x, y) = max(1 - y.x, 0) +L_(x, y) = max(1 - y.x, 0) $$ Note that the labels passed as input will have values as either 0 or 1. diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index d044ae056ee12..4dddf28792a8a 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -61,7 +61,7 @@ class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Increment Operator. -The equation is: +The equation is: $$Out = X + step$$ )DOC"); diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 7057b405458a6..0c5306e1d4f4a 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -30,10 +30,10 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "Return the element of input at index"); AddComment(R"DOC( - IndexSample OP returns the element of the specified location of X, - and the location is specified by Index. + IndexSample OP returns the element of the specified location of X, + and the location is specified by Index. - X tensor and Index tensor's shape must be 2-D, + X tensor and Index tensor's shape must be 2-D, dimension at 0 which usually is batch size must be equal. The returned tensor has the same shape and dimensions as the Index tensor. diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 05b27f8d11a6a..4c77e8b5b56c6 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -452,25 +452,25 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" - for nearest neighbor interpolation and \"bilinear\" for bilinear + for nearest neighbor interpolation and \"bilinear\" for bilinear interpolation and \"linear\" for linear interpolation.. Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimension(in height direction) and the 4th dimension(in width + in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. - - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this op) on a rectilinear 2D grid. The key idea is - to perform linear interpolation first in one direction, and then + + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. 
H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then again in the other direction. - Trilinear interpolation is an extension of linear interpolation for - interpolating functions of three variables (e.g. D-direction, - H-direction and W-direction in this op) on a rectilinear 3D grid. + Trilinear interpolation is an extension of linear interpolation for + interpolating functions of three variables (e.g. D-direction, + H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. Bicubic interpolation is an extension of cubic interpolation for interpolating @@ -478,24 +478,24 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. - Align_corners and align_mode are optional parameters,the calculation method + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. - + Example: For scale: - + if align_corners = True and out_{size}>1 : scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0) - + else: - + scale_{factor} = float(in_{size}/out_{size}) - - + + Nearest neighbor interpolation: - + if: align_corners = False @@ -518,16 +518,16 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { if: align_corners = False , align_mode = 0 - + input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: - + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - + input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: @@ -538,17 +538,17 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { if: align_corners = False , align_mode = 0 - + input : (N,C,D_in,H_in,W_in) output: (N,C,D_out,H_out,W_out) where: - + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - + input : (N,C,D_in,H_in,W_in) output: (N,C,D_out,H_out,W_out) where: @@ -570,13 +570,13 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - For details of nearest neighbor interpolation, please refer to Wikipedia: + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation - For details of bilinear interpolation, please refer to Wikipedia: + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation - For details of trilinear interpolation, please refer to Wikipedia: + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation For details of bicubic interpolation, please refer to Wikipedia: diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 07ecae637a7bf..62d9c547fa397 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -553,25 +553,25 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" - for nearest neighbor interpolation and \"bilinear\" for bilinear + for nearest neighbor 
interpolation and \"bilinear\" for bilinear interpolation and \"linear\" for linear interpolation.. Nearest neighbor interpolation is to perform nearest neighbor interpolation - in both the 3rd dimension(in height direction) and the 4th dimension(in width + in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. - - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this op) on a rectilinear 2D grid. The key idea is - to perform linear interpolation first in one direction, and then + + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + + Bilinear interpolation is an extension of linear interpolation for + interpolating functions of two variables (e.g. H-direction and + W-direction in this op) on a rectilinear 2D grid. The key idea is + to perform linear interpolation first in one direction, and then again in the other direction. - Trilinear interpolation is an extension of linear interpolation for - interpolating functions of three variables (e.g. D-direction, - H-direction and W-direction in this op) on a rectilinear 3D grid. + Trilinear interpolation is an extension of linear interpolation for + interpolating functions of three variables (e.g. D-direction, + H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. Bicubic interpolation is an extension of cubic interpolation for interpolating @@ -579,24 +579,24 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. - Align_corners and align_mode are optional parameters,the calculation method + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. 
- + Example: For scale: - + if align_corners = True and out_{size}>1 : scale_{factor} = (in_{size}-1.0)/(out_{size}-1.0) - + else: - + scale_{factor} = float(in_{size}/out_{size}) - - + + Nearest neighbor interpolation: - + if: align_corners = False @@ -619,16 +619,16 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { if: align_corners = False , align_mode = 0 - + input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: - + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - + input : (N,C,H_in,W_in) output: (N,C,H_out,W_out) where: @@ -639,17 +639,17 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { if: align_corners = False , align_mode = 0 - + input : (N,C,D_in,H_in,W_in) output: (N,C,D_out,H_out,W_out) where: - + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - + input : (N,C,D_in,H_in,W_in) output: (N,C,D_out,H_out,W_out) where: @@ -671,13 +671,13 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - For details of nearest neighbor interpolation, please refer to Wikipedia: + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation - For details of bilinear interpolation, please refer to Wikipedia: + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation - For details of trilinear interpolation, please refer to Wikipedia: + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation For details of bicubic interpolation, please refer to Wikipedia: diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 68d241e4ac28d..8d0cd10097f4b 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -46,7 +46,7 @@ class IscloseOpMaker : public framework::OpProtoAndCheckerMaker { "compared as equal. Default: :math:`False` .") .SetDefault(false); - AddComment(R"DOC( + AddComment(R"DOC( This operator checks if all :math:`x` and :math:`y` satisfy the condition: .. math:: diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index dbd2eb763d1ed..decee5567b486 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -72,19 +72,19 @@ class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker { While :math:`x` is Input(X) and :math:`y` is Input(Target). While :attr:`reduction` is :attr:`none`, output loss is in - the same shape as Input(X), loss in each point is calculated + the same shape as Input(X), loss in each point is calculated separately and no reduction is applied. - + While :attr:`reduction` is :attr:`mean`, output loss is in shape of [1] and loss value is the mean value of all losses. - + While :attr:`reduction` is :attr:`sum`, output loss is in shape of [1] and loss value is the sum value of all losses. - - While :attr:`reduction` is :attr:`batchmean`, output loss is + + While :attr:`reduction` is :attr:`batchmean`, output loss is in shape of [1] and loss value is the sum value of all losses divided by batch size. 
- + )DOC"); } }; diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index cede00d5b01ec..d4fed2db47ed0 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -63,14 +63,14 @@ class KronOpMaker : public framework::OpProtoAndCheckerMaker { Kron Operator. This operator computes the Kronecker product of two tensors, a - composite tensor made of blocks of the second tensor scaled by the + composite tensor made of blocks of the second tensor scaled by the first. This operator assumes that the rank of the two tensors, $X$ and $Y$ - are the same, if necessary prepending the smallest with ones. If the - shape of $X$ is [$r_0$, $r_1$, ..., $r_N$] and the shape of $Y$ is - [$s_0$, $s_1$, ..., $s_N$], then the shape of the output tensor is - [$r_{0}s_{0}$, $r_{1}s_{1}$, ..., $r_{N}s_{N}$]. The elements are + are the same, if necessary prepending the smallest with ones. If the + shape of $X$ is [$r_0$, $r_1$, ..., $r_N$] and the shape of $Y$ is + [$s_0$, $s_1$, ..., $s_N$], then the shape of the output tensor is + [$r_{0}s_{0}$, $r_{1}s_{1}$, ..., $r_{N}s_{N}$]. The elements are products of elements from $X$ and $Y$. The equation is: diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 873ab62a3d246..72813e76c757e 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -92,23 +92,23 @@ class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( LabelSmooth Operator. -Label smoothing is a mechanism to regularize the classifier layer. In machine -learning, optimizing the log-likelihood of the correct label directly may -cause two problems. First, it may result in overfitting: if the model learns +Label smoothing is a mechanism to regularize the classifier layer. In machine +learning, optimizing the log-likelihood of the correct label directly may +cause two problems. First, it may result in overfitting: if the model learns to assign full probability to the ground-truth label for each training example, -it is not guaranteed to generalize. Second, it encourages the differences -between the largest logit and all others to become large, reducing the ability -of the model to adapt. Label smoothing is proposed to encourage the model to -be less confident, which replaces the ground-truth label $y$ with the weighted +it is not guaranteed to generalize. Second, it encourages the differences +between the largest logit and all others to become large, reducing the ability +of the model to adapt. Label smoothing is proposed to encourage the model to +be less confident, which replaces the ground-truth label $y$ with the weighted sum of itself and some fixed distribution $\mu$, i.e. $$ \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu, $$ -where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and -$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for -$\mu$. This change in the ground-truth label is called label-smoothing +where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and +$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for +$\mu$. This change in the ground-truth label is called label-smoothing regularization or LSR. See more details about label smoothing in https://arxiv.org/abs/1512.00567. 
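The smoothing formula above is small enough to check by hand; here is a minimal numpy sketch (not from this patch), assuming the usual uniform prior mu = 1/num_classes:

```python
import numpy as np

def label_smooth(onehot, epsilon=0.1):
    """y_tilde = (1 - eps) * y + eps * mu, with mu the uniform distribution."""
    num_classes = onehot.shape[-1]
    mu = np.full_like(onehot, 1.0 / num_classes)  # fixed uniform distribution
    return (1.0 - epsilon) * onehot + epsilon * mu

y = np.array([0.0, 0.0, 1.0])
print(label_smooth(y))  # [0.0333... 0.0333... 0.9333...]
```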
diff --git a/paddle/fluid/operators/logspace_op.cc b/paddle/fluid/operators/logspace_op.cc index ac326004a1037..5e5e25a56dbca 100644 --- a/paddle/fluid/operators/logspace_op.cc +++ b/paddle/fluid/operators/logspace_op.cc @@ -54,11 +54,11 @@ class LogspaceOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("dtype", "The output data type."); AddOutput("Out", "A sequence of numbers."); AddComment(R"DOC( - Return fixed number of logarithmical-evenly spaced values within a given - interval. First entry is exponential of Start with base Base, and last - entry is exponential of Stop with base Base. In the case when Num is 1, - only exponential of Start with base Base is returned. If dtype is int32 - or int64, the decimal part of values will be truncated. + Return a fixed number of logarithmically evenly spaced values within a given + interval. First entry is exponential of Start with base Base, and last + entry is exponential of Stop with base Base. In the case when Num is 1, + only exponential of Start with base Base is returned. If dtype is int32 + or int64, the decimal part of values will be truncated. Like the logspace function of numpy. )DOC"); } diff --git a/paddle/fluid/operators/lookup_table_dequant_op.cc b/paddle/fluid/operators/lookup_table_dequant_op.cc index f5b15af4a41eb..e0ca707ffa70d 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.cc +++ b/paddle/fluid/operators/lookup_table_dequant_op.cc @@ -114,7 +114,7 @@ Lookup Table Dequant Operator. The `W` input is a quantized parameter for the sake of saving memories. This operator first indexes embeddings with `Ids`, -then dequantizes them and contact them as output (`Out`). +then dequantizes them and concatenates them as output (`Out`). The input Ids can carry the LoD (Level of Details) information, or not. And the output only shares the LoD information with input Ids. diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 84e4e5cd2cdf8..156fc55fb6b9a 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -259,11 +259,11 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator. -LSTMP has a separate projection layer after the LSTM layer, projecting the -original hidden state to a lower-dimensional one, which is proposed to reduce -the number of total parameters and furthermore computational complexity for -the LSTM, espeacially for the case that the size of output units is relative -large (https://research.google.com/pubs/archive/43905.pdf). +LSTMP has a separate projection layer after the LSTM layer, projecting the +original hidden state to a lower-dimensional one, which is proposed to reduce +the number of total parameters and furthermore the computational complexity of +the LSTM, especially for the case that the size of output units is relatively +large (https://research.google.com/pubs/archive/43905.pdf). The formula is as follows: @@ -291,14 +291,14 @@ denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ is the activation, such as logistic sigmoid function, and $i, f, o$ and $c$ are the input gate, forget gate, output gate, and cell activation vectors, respectively, all of which have the same size as -the cell output activation vector $h$. Here $h$ is usually called the hidden -state and $r$ denotes its recurrent projection. 
And $\tilde{c_t}$ is also -called the candidate hidden state, whose computation is based on the current +the cell output activation vector $h$. Here $h$ is usually called the hidden +state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also +called the candidate hidden state, whose computation is based on the current input and previous hidden state. The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ are the cell input and cell output activation functions and `tanh` is usually -used for them. $\overline{act_h}$ is the activation function for the +used for them. $\overline{act_h}$ is the activation function for the projection output, usually using `identity` or same as $act_h$. Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc index c6831f975c47a..923c14f3db0f6 100644 --- a/paddle/fluid/operators/lu_op.cc +++ b/paddle/fluid/operators/lu_op.cc @@ -24,7 +24,7 @@ namespace operators { class LUOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddComment(R"DOC(LU decomposition, + AddComment(R"DOC(LU decomposition, Computes the LU factorization of a matrix or batches of matrices A. )DOC"); AddInput("X", "(Tensor) The input tensor, shape of (*,m,n)"); diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc index 988cba43989e9..9f631a60c1556 100644 --- a/paddle/fluid/operators/lu_unpack_op.cc +++ b/paddle/fluid/operators/lu_unpack_op.cc @@ -24,7 +24,7 @@ namespace operators { class LU_UnpackOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddComment(R"DOC(Unpack L U and P to single matrix tensor, + AddComment(R"DOC(Unpack L U and P to single matrix tensor, unpack L and U matrix from LU, unpack permutation matrix Pmat from Pivots. )DOC"); AddInput("X", "(Tensor) The input LU tensor, shape of (*,m,n)"); diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index 44f77afee0005..47ed77cbfb4e9 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -102,19 +102,19 @@ class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker { MarginRankLoss Operator. This operator measures the loss given a pair of training sample -{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` -indicating X1 is ranked higher than `X2` and `Label = -1` otherwise. The loss +{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` +indicating `X1` is ranked higher than `X2` and `Label = -1` otherwise. The loss is calculated as: $loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$ The attribute `margin` here helps make the predictions more robust. -Denote the item ranked higher as the positive sample, otherwise the negative -sample. If the score of the two samples satisfies +Denote the item ranked higher as the positive sample, otherwise the negative +sample. If the score of the two samples satisfies $positive sample - negative sample < margin$ -the pair of samples will contribute to the final loss, which will backpropagate +the pair of samples will contribute to the final loss, which will backpropagate and train the ranking model to enlarge the difference between the two scores. 
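The loss above is a one-liner in numpy; an illustrative sketch (not code from this patch, function name hypothetical):

```python
import numpy as np

def margin_rank_loss(x1, x2, label, margin=0.1):
    """loss = max(0, -label * (x1 - x2) + margin)"""
    return np.maximum(0.0, -label * (x1 - x2) + margin)

# label = +1 says x1 should outrank x2; the pair only contributes loss
# when x1 - x2 < margin.
print(margin_rank_loss(np.array([0.8]), np.array([0.3]), np.array([1.0])))  # [0.]
```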
For batch input with size `batch_size`, `X1`, `X2` and `Label` diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index 80313f156f1f2..820e754049a23 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -230,9 +230,9 @@ void MatchMatrixTensorOpMaker::Make() { Match Matrix Tensor Operator This operator calculates X * W * Y, only supporting 2-D for X and Y. - the output is a level-1 LodTensor: + the output is a level-1 LodTensor: level_0: dim_t - + NOTE: only support 'float32' data type now. )DOC"); diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 8f233d7650daf..3b32acc8d707e 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -193,8 +193,8 @@ class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker { "doing multiplication") .SetDefault(false); AddComment( - R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), - B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). + R"DOC(Matrix multiplication Out = X * Y. X has shape (d0, d1 ... M, K), + Y has shape (d0, d1 ... K, N), Out has shape (d0, d1 ... M, N). In addition, it also follows the broadcast rule which is similar to numpy.matmul. )DOC"); diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc index 315708cc05ff4..0e75629f711a9 100644 --- a/paddle/fluid/operators/mean_iou_op.cc +++ b/paddle/fluid/operators/mean_iou_op.cc @@ -87,10 +87,10 @@ class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker { mean-IOU Operator. Mean Intersection-Over-Union is a common evaluation metric for semantic image segmentation, which first computes the IOU for each -semantic class and then computes the average over classes. -IOU is defined as follows: +semantic class and then computes the average over classes. +IOU is defined as follows: IOU = true_positive / (true_positive + false_positive + false_negative). -It is based on pixel level area while "IOU Similarity Operator" +It is based on pixel level area while "IOU Similarity Operator" is based on area of rectangle. )DOC"); diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index 9fb06c5968cbd..273b1fe7c9e70 100644 --- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -118,7 +118,7 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker { "6: dst is on CustomDevicePlace"); AddComment(R"DOC( Memcpy Operator. - By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or + By now, it ONLY supports memory copy between CUDAPinnedPlace <-> CUDAPlace or NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload. You would have to update it if you want other more capacities. diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc index 924711d9a3fc9..0b95200c12828 100644 --- a/paddle/fluid/operators/meshgrid_op.cc +++ b/paddle/fluid/operators/meshgrid_op.cc @@ -65,7 +65,7 @@ Take: N tensors, each of which can be either scalar or 1-dimensional vector, and N-dimensional grids. Args: - tensors (list of tensor): if the input k tensors has (N1,), (N2,),..., (Nk,), then + tensors (list of tensor): if the input k tensors has (N1,), (N2,),..., (Nk,), then the output tensors are all of size (N1, N2, ...., Nk). 
Example:: diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3665e035d4a7a..f8e57adc703c1 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -44,7 +44,7 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Total", "The samples count of current batch"); AddComment(R"DOC( -Accuracy Operator. +Accuracy Operator. It will print accuracy rate for classification. The accuracy is calculated as follows: @@ -52,7 +52,7 @@ The accuracy is calculated as follows: $$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$ Both the input Out and Label can carry the LoD (Level of Details) -information, or not. But the output only shares the LoD information +information, or not. But the output only shares the LoD information with the input Out(Inference). )DOC"); diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index f8c9c9d86a995..000e31aad9ac9 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -214,10 +214,7 @@ class MatMulMKLDNNHandler } astream.wait(); - auto format = - MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_format(format); - out->set_layout(DataLayout::kMKLDNN); + out->set_mem_desc(dst_memory_p->get_desc().reshape(out->dims())); } std::shared_ptr AcquireDstMemory( @@ -651,10 +648,18 @@ void ExecuteMatMulV2(const ExecutionContext &ctx, auto &astream = MKLDNNDeviceContext::tls().get_stream(); matmul_p->execute(astream, matmul_args); astream.wait(); - auto format = - MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_format(format); - out->set_layout(DataLayout::kMKLDNN); + + // TODO(jczaja): Explain why int8 format of dst is ABCD and do not need + // permute + if (IsOutputFused(ctx) && !IsInt8()) { + auto axis = ctx.Attr>("fused_transpose_Out"); + auto permuted_md = dst_memory_p->get_desc().permute_axes(axis); + out->set_mem_desc( + permuted_md.reshape(phi::vectorize(out->dims()))); + } else { + out->set_mem_desc( + dst_memory_p->get_desc().reshape(phi::vectorize(out->dims()))); + } } template @@ -836,8 +841,7 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); - dx->set_format(paddle::platform::GetMKLDNNFormat( - dst_memory_p->get_desc().reshape(squeezed_dims))); + dx->set_mem_desc(dst_memory_p->get_desc().reshape(squeezed_dims)); } std::vector ExtendDimsWithOnes(const std::vector &dims, @@ -1119,9 +1123,8 @@ void MatMulGradMKLDNNKernel::ExecuteMatMulGrad( matmul_p->execute(astream, matmul_args); astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat( - dst_memory_p->get_desc().reshape(vectorize(out->dims())))); + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); } template @@ -1184,13 +1187,13 @@ void MatMulGradMKLDNNKernel::RunKernel(const ExecutionContext &ctx) const { if (dx) { if (dx_dims != x.dims()) { dx->Resize(dx_dims); - dx->set_format(x.format()); + dx->set_mem_desc(x.mem_desc()); } } if (dy) { if (dy_dims != y.dims()) { dy->Resize(dy_dims); - dy->set_format(y.format()); + dy->set_mem_desc(y.mem_desc()); } } } diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index e727a4fe9fb48..e9150b0c58f76 
100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -221,7 +221,7 @@ class MulPrimitiveFactory { to_void_cast(x_tmp.data())); x_tmp.Resize(data->dims()); - x_tmp.set_format(platform::GetMKLDNNFormat(dst_mdesc)); + x_tmp.set_mem_desc(dst_mdesc); data_matrix = framework::ReshapeToMatrix(x_tmp, num_col_dims); } else { data_matrix = framework::ReshapeToMatrix(*data, num_col_dims); @@ -235,11 +235,7 @@ class MulPrimitiveFactory { const Tensor *in) { x_input_->set_data_handle(to_void_cast(in->data())); output_->set_data_handle(out->mutable_data(ctx.GetPlace())); - - if (out->format() == MKLDNNMemoryFormat::undef) { - auto output_format = platform::GetMKLDNNFormat(*output_); - out->set_format((MKLDNNMemoryFormat)output_format); - } + out->set_mem_desc(output_->get_desc()); } template @@ -272,7 +268,7 @@ class MulPrimitiveFactory { auto buffer_size = dst_desc.get_size(); OT *output_data = output->mutable_data(ctx.GetPlace(), buffer_size); - output->set_format(paddle::platform::GetMKLDNNFormat(dst_desc)); + output->set_mem_desc(dst_desc); return memory(dst_desc, engine_, to_void_cast(output_data)); } @@ -392,9 +388,10 @@ class MulMKLDNNINT8Kernel : public framework::OpKernel { if (out_dims.size() != 2) { out->Resize(out_dims); } - out->set_layout(DataLayout::kMKLDNN); - out->set_format(platform::MKLDNNFormatForSize(out_dims.size(), - MKLDNNMemoryFormat::nchw)); + + auto in_md = dnnl::memory::desc(*dnnl_primitive_desc_query_md( + mul.get_primitive_desc(), dnnl_query_dst_md, 0)); + out->set_mem_desc(in_md.reshape(phi::vectorize(out->dims()))); } }; @@ -442,10 +439,11 @@ class MulMKLDNNKernel : public framework::OpKernel { matmul_p->execute(astream, matmul_args); astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - // plain output formats are enforced inside handler - out->set_format(platform::MKLDNNFormatForSize( - out->dims().size(), dnnl::memory::format_tag::nchw)); + // This kernel flattens dims, so we need the unflattened version + // to be set on out. The reshape requires a plain layout, but + // MatmulV2MKLDNNHandler enforces one, so it should work + out->set_mem_desc( + dst_memory_p->get_desc().reshape(phi::vectorize(out->dims()))); } private: diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc index e23971c86ada8..a7c6bd28486f8 100644 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -72,11 +72,19 @@ class SliceMKLDNNKernel : public framework::OpKernel { ends[i] = ends[i] < 0 ? x_vec_dims[axes[i]] + ends[i] : std::min(ends[i], x_vec_dims[axes[i]]); offsets[axes[i]] = starts[i]; - slice_dims[axes[i]] = ends[i] - starts[i]; + slice_dims[axes[i]] = + std::max(static_cast(0), ends[i] - starts[i]); } out->Resize(phi::make_ddim(slice_dims)); + // Note(0x45f): To support slice Tensors with shapes like [0, 0, 0]. 
+ if (!x->initialized()) { + out->mutable_data(x->place(), x->dtype()); + out->set_layout(experimental::DataLayout::kMKLDNN); + return; + } + dnnl::memory::data_type x_type = framework::ToMKLDNNDataType(framework::TransToProtoVarType(x->dtype())); diff --git a/paddle/fluid/operators/mode_op.cc b/paddle/fluid/operators/mode_op.cc index 75a55f377e3cc..fce3028ab75cf 100644 --- a/paddle/fluid/operators/mode_op.cc +++ b/paddle/fluid/operators/mode_op.cc @@ -51,7 +51,7 @@ class ModeOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(-1); AddAttr("keepdim", "Keep the dim to reduce.").SetDefault(false); AddComment(R"DOC( -This operator finds the mode of input Tensor. And outputs their values and indices as vectors. +This operator finds the mode of the input Tensor and outputs the values and indices as vectors. )DOC"); } }; diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index 17f323d0bcba8..e0e64bb0c2680 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -86,7 +86,7 @@ Since target Y is not differentiable, calculating gradient for Y is illegal. The formula of modified huber loss is: $$ -L(y, f(x)) = +L(y, f(x)) = \begin{cases} (\max(0, 1 - yf(x)))^2, \text{if} \ yf(x) >= -1 \\ -4yf(x), \quad \text{otherwise} diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index 14191d018b74d..782b67d90e81f 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -82,10 +82,10 @@ The loss can be described as: $Out[i] = -X[Label[i]]*Weight[Label[i]]$ -It can also be used for higher dimension inputs, such as 2D images, by -providing an input of shape (batch_size, C, d1, d2, ..., dK), with -K >= 1, where K is the number of dimensions, and a Label of -appropriate shape. In the case of images, it computes NLL loss +It can also be used for higher dimension inputs, such as 2D images, by +providing an input of shape (batch_size, C, d1, d2, ..., dK), with +K >= 1, where K is the number of dimensions, and a Label of +appropriate shape. In the case of images, it computes NLL loss per-pixel. )DOC"); diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 76737f2bc35a7..9754628b1b8eb 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -54,7 +54,7 @@ y = \frac{x}{ \sqrt{\sum {x^2} + \epsilon }} $$ where, $\sum {x^2}$ is calculated along the `axis` dimension. - + )DOC"); } }; diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index 372a71706ab5e..fd8e027092410 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -41,8 +41,6 @@ TEST(op_debug_str, test_unknown_dtype) { desc.SetOutput(framework::GradVarName("Y"), {framework::GradVarName("Y")}); desc.SetAttr("axis", -1); desc.SetAttr("use_mkldnn", false); - desc.SetAttr("x_data_format", ""); - desc.SetAttr("y_data_format", ""); auto x_tensor = scope.Var("X")->GetMutable(); x_tensor->Resize(dim); diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index ad1262a7d2d55..d058b890cbd9d 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -116,7 +116,7 @@ class DpsgdOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Dpsgd Optimizer. 
-We implement the Dpsgd optimizer according to CCS16 paper - +We implement the Dpsgd optimizer according to CCS16 paper - Deep Learning with Differential Privacy. Dpsgd updates: diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc index cc3c99f9b1129..e9d6ab77f4357 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cc +++ b/paddle/fluid/operators/optimizers/lamb_op.cc @@ -101,8 +101,8 @@ class LambOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. -LAMB Optimizer is designed to scale up the batch size of training without losing -accuracy, which supports adaptive element-wise updating and accurate layer-wise +LAMB Optimizer is designed to scale up the batch size of training without losing +accuracy, which supports adaptive element-wise updating and accurate layer-wise correction. For more information, please refer to https://arxiv.org/abs/1904.00962. The updating of parameters follows: @@ -121,7 +121,7 @@ r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon} \\ w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1}) $$ -where $m$ is the 1st moment, and $v$ the 2nd moment, $\eta$ the +where $m$ is the 1st moment, and $v$ the 2nd moment, $\eta$ the learning rate, $\lambda$ the weight decay rate. )DOC"); } diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index f576827f9cadf..d3d45ad3c6ba6 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -62,11 +62,11 @@ class Pow2DecayWithLinearWarmupOpMaker AddComment(R"DOC( The Pow2DecayWithLinearWarmup learning rate scheduler. 
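The three-phase schedule spelled out in the hunk just below reads directly as code; a hedged sketch, with an illustrative function name and plain float math rather than the operator's actual implementation:

#include <cstdint>

float Pow2DecayWithLinearWarmupLR(int64_t step_num, int64_t warmup_steps,
                                  int64_t total_steps, float base_lr,
                                  float end_lr) {
  if (step_num < warmup_steps)  // linear warmup
    return base_lr * step_num / static_cast<float>(warmup_steps);
  if (step_num <= total_steps) {  // squared polynomial decay
    float factor = 1.0f - static_cast<float>(step_num - warmup_steps) /
                              (total_steps - warmup_steps);
    return (base_lr - end_lr) * factor * factor + end_lr;
  }
  return end_lr;  // past total_steps the rate stays at end_lr
}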
-When step_num < warmup_steps, lr = base_lr * step_num / warmup_steps +When step_num < warmup_steps, lr = base_lr * step_num / warmup_steps -When warmup_steps <= step_num <= total_steps, - factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) - lr = (base_lr - end_lr) * factor * factor + end_lr +When warmup_steps <= step_num <= total_steps, + factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) + lr = (base_lr - end_lr) * factor * factor + end_lr When step_num > total_steps, lr = end_lr diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index 072e39dd91cc0..de280a6788779 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -119,9 +119,9 @@ param = sign(prox\_param) / (1 + learning\_rate * l2) * \max(|prox\_param| - learning\_rate * l1 , 0) $$ -The paper that proposed Proximal GD: +The paper that proposed Proximal GD: (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) -Here, we use the adagrad learning rate as specified here: +Here, we use the adagrad learning rate as specified here: (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) )DOC"); diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index 50676863678c1..2460b30fa26b0 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -92,7 +92,7 @@ Optimizer that implements the proximal gradient descent algorithm: prox\_param = param - learning\_rate * grad \\ param = sign(prox\_param) / (1 + learning\_rate * l2) * \max(|prox\_param| - learning\_rate * l1, 0) -$$ +$$ The paper that proposed Proximal Gradient Descent: (http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf) diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc index d80a5d8900c40..3e923d34a0684 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc @@ -66,7 +66,7 @@ class RmspropOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("centered", "(bool, default false) use centered rmsprop.") .SetDefault(false); AddComment(R"DOC( -Rmsprop Optimizer. +Rmsprop Optimizer. $$ MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\ diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 0f2873d73e768..66aef5fe4eaa2 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -775,7 +775,7 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("NCHW"); AddComment(R"DOC( Pad2d Operator. -Pad 2-d images according to 'paddings' and 'mode'. +Pad 2-d images according to 'paddings' and 'mode'. If mode is 'reflect', paddings[0] and paddings[1] must be no greater than height-1. And the width dimension has the same condition. diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index 301c21b2fcdcf..6141e6e98bb60 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -113,7 +113,7 @@ class Pad3dOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault("NCDHW"); AddComment(R"DOC( Pad3d Operator. -Pad 3-d images according to 'paddings' and 'mode'. 
+Pad 3-d images according to 'paddings' and 'mode'. If mode is 'reflect', paddings[0] and paddings[1] must be no greater than width-1. The height and depth dimension have the same condition. diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 962feb2ee9262..fb4a90ebd8ca9 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -59,7 +59,7 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Pad Operator. -Pad input into output, as specified by paddings and pad_value. +Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example: Given: diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 148eb8806b5d8..eb8271edccf95 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -161,9 +161,9 @@ class PartialSumOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(-1); AddComment(R"DOC( PartialSum Operator. -This Op can sum the vars by specifying the initial position(start_index) and length(length). +This Op can sum the vars by specifying the initial position(start_index) and length(length). This OP exists in contrib, which means that it is not shown to the public. -Only 2-D Tensor or LodTensor input is supported. Slice and concat can only be +Only 2-D Tensor or LodTensor input is supported. Slice and concat can only be performed along the second dimension. Examples: diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index e73abba60ddce..64e73217068b3 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -60,9 +60,9 @@ class PixelShuffleOpMaker : public framework::OpProtoAndCheckerMaker { with a stride of :math:`1/r`. Please refer to the paper: - `Real-Time Single Image and Video Super-Resolution Using an Efficient + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network `_ - by Shi et. al (2016) for more details. + by Shi et al. (2016) for more details. )DOC"); } diff --git a/paddle/fluid/operators/pixel_unshuffle_op.cc b/paddle/fluid/operators/pixel_unshuffle_op.cc index 9e31a8567abc5..e0ac5283ab86b 100644 --- a/paddle/fluid/operators/pixel_unshuffle_op.cc +++ b/paddle/fluid/operators/pixel_unshuffle_op.cc @@ -55,9 +55,9 @@ class PixelUnshuffleOpMaker : public framework::OpProtoAndCheckerMaker { This operation is the reversion of PixelShuffle operation. Please refer to the paper: - `Real-Time Single Image and Video Super-Resolution Using an Efficient + `Real-Time Single Image and Video Super-Resolution Using an Efficient Sub-Pixel Convolutional Neural Network `_ - by Shi et. al (2016) for more details. + by Shi et al. (2016) for more details. )DOC"); } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 4ae7b2f1709a6..57aef714a0502 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -119,7 +119,7 @@ MaxPool2d Operator. The maxPooling2d with index operation calculates the output and the mask based on the input, ksize, strides, and paddings parameters. 
Input(X) and output(Out, Mask) are in NCHW format, where N is batch size, C is the -number of channels, H is the height of the feature, +number of channels, H is the height of the feature, and W is the width of the feature. Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. @@ -136,12 +136,12 @@ The input(X) size and output(Out, Mask) size may be different. H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ - + For adaptive = true: $$ H_{out} = ksize[0] W_{out} = ksize[1] $$ - + )DOC"); } @@ -210,7 +210,7 @@ The maxpooling3d with index operation calculates the output and the mask based on the input and ksize, strides, paddings parameters. Input(X) and output(Out, Mask) are in NCDHW format, where N is batch size, C is the number of channels, and D, H and W are the depth, height and -width of the feature, respectively. +width of the feature, respectively. Parameters(ksize, strides, paddings) are three elements. These three elements represent depth, height and width, respectively. The input(X) size and output(Out, Mask) size may be different. @@ -227,7 +227,7 @@ The input(X) size and output(Out, Mask) size may be different. H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ - + For adaptive = true: $$ D_{out} = ksize[0] H_{out} = ksize[1] W_{out} = ksize[2] diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt index 9d24cf89af494..30e162a4dd2a9 100644 --- a/paddle/fluid/operators/prim_ops/CMakeLists.txt +++ b/paddle/fluid/operators/prim_ops/CMakeLists.txt @@ -36,7 +36,8 @@ set(PRIM_OP_SRCS pow_p_op.cc max_p_op.cc erf_p_op.cc - abs_p_op.cc) + abs_p_op.cc + cast_p_op.cc) cc_test( prim_op_test diff --git a/paddle/fluid/operators/prim_ops/cast_p_op.cc b/paddle/fluid/operators/prim_ops/cast_p_op.cc new file mode 100644 index 0000000000000..5c8b9ab45c6bc --- /dev/null +++ b/paddle/fluid/operators/prim_ops/cast_p_op.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class CastPrimOp : public framework::OperatorBase { + public: + CastPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator cast_p should not be excuted directly")); + } +}; + +class CastPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of cast_p op."); + AddOutput("Y", "(Tensor), The output tensor of cast_p op."); + AddAttr("dtype", "output data type"); + AddComment(R"DOC(Autograd primitive cast_p operator.)DOC"); + } +}; + +class CastPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); + PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); + } +}; + +class CastPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto out_type = static_cast( + PADDLE_GET_CONST(int, ctx->GetAttr("dtype"))); + ctx->SetOutputDataType("Y", out_type); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(cast_p, + paddle::operators::CastPrimOp, + paddle::operators::CastPrimOpMaker, + paddle::operators::CastPrimOpShapeInference, + paddle::operators::CastPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index f3a4af3b674b3..e85e51d9ebebe 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -70,8 +70,8 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(1); AddComment(R"Doc( Position sensitive region of interest pooling (also known as PSROIPooling) is to perform -position-sensitive average pooling on regions of interest specified by input, takes as -input N position-sensitive score maps and a list of num_rois regions of interest. +position-sensitive average pooling on regions of interest specified by input, takes as +input N position-sensitive score maps and a list of num_rois regions of interest. PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. )Doc"); diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index b86cd9538acea..6736cb4c87c07 100644 --- a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -79,7 +79,7 @@ class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator takes a batch of instance, and do random cropping on each instance. It means that cropping positions differs on each instance, which is determined - by an uniform random generator. 
All cropped instances have the same shape, which + by a uniform random generator. All cropped instances have the same shape, which is determined by the operator's attribute 'shape'. )DOC"); } diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc index 565707853e22c..78366efc53bf9 100644 --- a/paddle/fluid/operators/randperm_op.cc +++ b/paddle/fluid/operators/randperm_op.cc @@ -71,7 +71,7 @@ class RandpermOpMaker : public framework::OpProtoAndCheckerMaker { "Default: 0.") .SetDefault(0); - AddComment(R"DOC( + AddComment(R"DOC( This operator returns a random permutation of integers from 0 to n-1. )DOC"); } diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 716fc58d4187b..f68e1668aa9a7 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -146,7 +146,7 @@ class RankAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddComment(R"DOC( RankAttention Operator. -This Op can calculate rank attention between input and rank_param, +This Op can calculate rank attention between input and rank_param, and rank_param gives the organization of data. Notice: It currently supports GPU device. This Op exists in contrib, which means that it is not shown to the public. )DOC"); diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index b353b2992ce19..2daf8c5d6b186 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -155,7 +155,7 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { RankLoss Operator. RankLoss operator for RankNet -(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). +(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf). RankNet is a pairwise ranking model with one training sample consisting of a pair of doc A and B, and the label P indicating that A is ranked higher than B or not: @@ -164,8 +164,8 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of the input pair. The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label -(P_{i,j}), which represent the output score of RankNet for the two docs and -the label respectively, and yields the rank loss C_{i,j} using the following +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following equation: $$ diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 32e772374b38a..617c47530c9e3 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -30,10 +30,10 @@ class RealOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(Tensor), The input tensor of real op."); AddOutput("Out", "(Tensor), The output tensor of real op."); - AddComment(R"DOC( -Real Operator. + AddComment(R"DOC( +Real Operator. -This operator is used to get a new tensor containing real values +This operator is used to get a new tensor containing real values from a tensor with complex data type. 
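For the real op documented above, the whole contract fits in a few lines; a sketch on plain buffers (illustrative helper, not Paddle's kernel):

#include <complex>
#include <vector>

// Return a new buffer holding the real component of each complex element.
std::vector<float> RealPart(const std::vector<std::complex<float>>& x) {
  std::vector<float> out;
  out.reserve(x.size());
  for (const auto& v : x) out.push_back(v.real());
  return out;
}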
)DOC"); diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index c3f61d4d2b399..6778855bcb19f 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -70,7 +70,7 @@ X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The indices in RankTable are [3, 0, 2, 1]. Out = [Slice3, Slice0, Slice2, Slice1] with no LoD information is appended. -**NOTE**: +**NOTE**: This operator sorts Input(X) according to a given LoDRankTable which does not need to be calculated according to Input(X). It can be calculated according to another different sequence, and then this operator sorts Input(X) according diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc index 4e99be4e521ac..810a73d89d217 100644 --- a/paddle/fluid/operators/reverse_op.cc +++ b/paddle/fluid/operators/reverse_op.cc @@ -69,7 +69,7 @@ class ReverseOpMaker : public framework::OpProtoAndCheckerMaker { Out = [[11, 12, 13, 14, 15] [6, 7, 8, 9, 10] [1, 2, 3, 4, 5]]. - + Case 2: Given X = [[[1, 2, 3, 4] diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index c02dc492c33a0..922d255bbe20e 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -116,17 +116,17 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { **RoIAlign Operator** Region of interest align (also known as RoI align) is to perform -bilinear interpolation on inputs of nonuniform sizes to obtain +bilinear interpolation on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7) Dividing each region proposal into equal-sized sections with the pooled_width and pooled_height. Location remains the origin result. -In each ROI bin, the value of the four regularly sampled locations +In each ROI bin, the value of the four regularly sampled locations are computed directly through bilinear interpolation. The output is the mean of four locations. -Thus avoid the misaligned problem. +Thus avoid the misaligned problem. )DOC"); } }; diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 0878c332477f1..c95e235aff98b 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -124,7 +124,7 @@ The operator has three steps: 3. Copying these max values to the output buffer -ROI Pooling for Faster-RCNN. The link below is a further introduction: +ROI Pooling for Faster-RCNN. The link below is a further introduction: https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn )DOC"); } diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index 9f66073c36be2..7ac1d4b8d4508 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -85,10 +85,10 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker { "with shifts or size == 0") .SetDefault({}); AddComment(R"DOC( - Roll the tensor along the given dimension(s). + Roll the tensor along the given dimension(s). Elements that are shifted beyond the last position are re-introduced at the first position. If a dimension - is not specified, the tensor will be flattened before + is not specified, the tensor will be flattened before rolling and then restored to the original shape. 
)DOC"); } diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index fc39d174c90ae..1bf471641d5a5 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -100,20 +100,20 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( :strong:`Row-convolution operator` -The row convolution is called lookahead convolution. This operator was +The row convolution is called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2: -http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf -The main motivation is that a bidirectional RNN, useful in DeepSpeech -like speech models, learns representation for a sequence by performing a -forward and a backward pass through the entire sequence. However, unlike +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representation for a sequence by performing a +forward and a backward pass through the entire sequence. However, unlike unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online -and low-latency setting. The lookahead convolution incorporates information -from future subsequences in a computationally efficient manner to improve -unidirectional recurrent neural networks. The row convolution operator is +and low-latency setting. The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks. The row convolution operator is different from the 1D sequence convolution, and is computed as follows: -Given an input sequence $X$ of length $t$ and input dimension $D$, +Given an input sequence $X$ of length $t$ and input dimension $D$, and a filter ($W$) of size $context \times D$, the output sequence is convolved as: diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 0d384eef8a02c..45fee045cbfd5 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -133,14 +133,14 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( RunProgram operator. -The RunProgram operator receives a program's feed targets, fetch targets, -and parameters, and receives the forward and backward program desc +The RunProgram operator receives a program's feed targets, fetch targets, +and parameters, and receives the forward and backward program desc as attributes, and then executes the program by executor. -NOTE: This operator is added so that the inference model stored by -`fluid.io.save_inference_model` under the static graph mode can be loaded +NOTE: This operator is added so that the inference model stored by +`fluid.io.save_inference_model` under the static graph mode can be loaded under the dynamic graph mode for fine-tuning or inferencing. - + )DOC"); } }; diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index 6f23eeebafa54..ee9abf6f35400 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -102,7 +102,7 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( """ Computes sampled output training logits and labels suitable for implementing - sampled softmax. + sampled softmax. 
""" )DOC"); diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index afe43a337bbf8..1beb06366ea91 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -54,7 +54,7 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { Searchsorted Operator. This OP is used to find the index of the corresponding sorted_sequence in the innermost dimension based on the given values. - + )DOC"); } }; diff --git a/paddle/fluid/operators/select_output_op.cc b/paddle/fluid/operators/select_output_op.cc index 0cb7e058a68d8..ad9cbf22cf527 100644 --- a/paddle/fluid/operators/select_output_op.cc +++ b/paddle/fluid/operators/select_output_op.cc @@ -83,7 +83,7 @@ class SelectOutputOpProtoMaker : public framework::OpProtoAndCheckerMaker { // (minimal viable product) here. AddComment(R"DOC( Split input variable into one output branch. The mask is an integer tensor to -specify which output branch should copy the input. +specify which output branch should copy the input. )DOC"); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index de55f1ab52a35..337ea46b260e9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -59,10 +59,10 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { Sequence Enumerate Operator. Generate a new sequence for the input index sequence, which enumerates all the -sub-sequences with length `win_size` of the input. +sub-sequences with length `win_size` of the input. The enumerated sequence has the same 1st dimension with variable `input`, and the 2nd dimension is `win_size`, padded by `pad_value` if necessary in generation. - + Examples: Case 1: Input: diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc index c64b568e533d0..2943b8895978f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc @@ -62,17 +62,17 @@ class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sequence Erase Operator. -Sequence erase operator erases tokens specified by Attr(tokens) from the input -sequences Input(X), and outputs the remaining data and modifies the LoD +Sequence erase operator erases tokens specified by Attr(tokens) from the input +sequences Input(X), and outputs the remaining data and modifies the LoD information at the same time. For example, given a 2-D LoDTensor X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T with lod = [[0, 3, 6, 10]], there are three sequences in the input: - + X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T. -If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing +If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing operation, the three sequences become X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T. @@ -83,8 +83,8 @@ Hence the LoDTensor Output(Out) should be with lod = [[0, 1, 3, 7]]. -An example usage for this operator is to remove the special tokens when -computing the edit distance between two strings, such as blank, start token, +An example usage for this operator is to remove the special tokens when +computing the edit distance between two strings, such as blank, start token, and end token. 
)DOC"); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index 2ed9c44f5928c..8ea756e455e23 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -85,7 +85,7 @@ This operator outputs a Mask according to Input(X) and Attr(maxlen). Supposing Input(X) is a Tensor with shape [d_1, d_2, ..., d_n], the Output(Y) is a mask with shape [d_1, d_2, ..., d_n, maxlen], where: -Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) +Y(i_1, i_2, ..., i_n, j) = (j < X(i_1, i_2, ..., i_n)) If maxlen < 0, maxlen = max(X) )DOC"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index e5c84d45d55e9..d427e339fb9c3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -170,9 +170,9 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sequence Pad Operator - This operator pads sequences in a same batch to a consistent length. - The length is specified by attribute 'padded_length'. New elements, - whose values are specified by input 'PadValue', will be appended to + This operator pads sequences in a same batch to a consistent length. + The length is specified by attribute 'padded_length'. New elements, + whose values are specified by input 'PadValue', will be appended to the end of each sequence, to make their final lengths consistent. Following are cases to better explain how this works: @@ -186,10 +186,10 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { PadValue.data = [0] and attribite 'padded_length' = 4, then we get LoDTensor: - Out.data = [[a, b, 0, 0], + Out.data = [[a, b, 0, 0], [c, d, e, 0]] Length.data = [2, 3] - + Case 2: Given a 1-level LoDTensor input(X): @@ -197,13 +197,13 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] and Input(PadValue): PadValue.data = [0] - and attribite 'padded_length' = -1, which mean using the length + and attribite 'padded_length' = -1, which mean using the length of longest input sequence(3 in this case), then we get LoDTensor: - Out.data = [[[a1, a2], [b1, b2], [0, 0]], + Out.data = [[[a1, a2], [b1, b2], [0, 0]], [[c1, c2], [d1, d2], [e1, e2]]] Length.data = [2, 3] - + Case 3: Given a 1-level LoDTensor input(X): @@ -211,10 +211,10 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { X.data = [[a1, a2], [b1, b2], [c1, c2], [d1, d2], [e1, e2]] and Input(PadValue): PadValue.data = [p1, p2] - and attribite 'padded_length' = -1, which mean using the length + and attribite 'padded_length' = -1, which mean using the length of longest input sequence(3 in this case), then we get LoDTensor: - Out.data = [[[a1, a2], [b1, b2], [p1, p2]], + Out.data = [[[a1, a2], [b1, b2], [p1, p2]], [[c1, c2], [d1, d2], [e1, e2]]] Length.data = [2, 3] diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 613dc8bfbc9b1..fe91dd00d4f86 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -108,8 +108,8 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Sequence Unpad Operator - This operator removes the padding data in the input 
sequences and convert - them into sequences with actual length as output, identitied by lod + This operator removes the padding data in the input sequences and converts + them into sequences with actual length as output, identified by lod information. Example: @@ -117,9 +117,9 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { Given input tensor Input(X): X.data = [[ 1.0, 2.0, 3.0, 4.0, 5.0], [ 6.0, 7.0, 8.0, 9.0, 10.0], - [11.0, 12.0, 13.0, 14.0, 15.0]], -` - in which there are 3 sequences padded to length 5, and the actual length + [11.0, 12.0, 13.0, 14.0, 15.0]], +` + in which there are 3 sequences padded to length 5, and the actual length specified by Input(Length): Length.data = [2, 3, 4], @@ -127,7 +127,7 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { after unpadding, Output(Out) will be: Out.data = [[1.0, 2.0, 6.0, 7.0, 8.0, 11.0, 12.0, 13.0, 14.0]] - Out.lod = [[0, 2, 5, 9]] + Out.lod = [[0, 2, 5, 9]] )DOC"); } diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index bd09ff9921b26..e601a50409936 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -55,10 +55,10 @@ class ShardIndexOpMaker : public framework::OpProtoAndCheckerMaker { This layer creates the sharded index for input. This layers is used in model- and data- parallel mixed training generally, in which the index data (usually the label) should be recaculated in each trainer according -to +to .. math:: - + assert index_num % nshards == 0 shard_size = index_num / nshards @@ -76,13 +76,13 @@ the original index should be recalculated (i.e. sharded) before. X is a Tensor of integer values: X.shape = [4, 1] X.data = [[1], [6], [12], [19]] - + suppose index_num = 20 and nshards = 2, then we get shard_size = 10 - + if shard_id == 0, we get the Out: Out.shape = [4, 1] Out.data = [[1], [6], [-1], [-1]] - + if shard_id == 1, we get the Out: Out.shape = [4, 1] Out.data = [[-1], [-1], [2], [9]] diff --git a/paddle/fluid/operators/similarity_focus_op.cc b/paddle/fluid/operators/similarity_focus_op.cc index f390e99da66a9..5c5343bf4248c 100644 --- a/paddle/fluid/operators/similarity_focus_op.cc +++ b/paddle/fluid/operators/similarity_focus_op.cc @@ -35,17 +35,17 @@ class SimilarityFocusOpMaker : public framework::OpProtoAndCheckerMaker { SimilarityFocus Operator. Generate a similarity focus mask with the same shape of input using the following method: -1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding - to the axis according to the indexes. For example, if axis=1 and indexes=[a], - it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X +1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding + to the axis according to the indexes. For example, if axis=1 and indexes=[a], + it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C). -2. For each index, find the largest numbers in the tensor T, so that the same - row and same column has at most one number(what it means is that if the - largest number has been found in the i-th row and the j-th column, then - the numbers in the i-th row or j-th column will be skipped. And then the - next largest number will be selected from the remaining numbers. Obviously - there will be min(B, C) numbers), and mark the corresponding position of the - 3-D similarity focus mask as 1, otherwise as 0. 
Do elementwise-or for +2. For each index, find the largest numbers in the tensor T, so that the same + row and same column has at most one number(what it means is that if the + largest number has been found in the i-th row and the j-th column, then + the numbers in the i-th row or j-th column will be skipped. And then the + next largest number will be selected from the remaining numbers. Obviously + there will be min(B, C) numbers), and mark the corresponding position of the + 3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for each index. 3. Broadcast the 3-D similarity focus mask to the same shape of input X. diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc index 6f867c05e2d80..48dc3d782481d 100644 --- a/paddle/fluid/operators/sparse_attention_op.cc +++ b/paddle/fluid/operators/sparse_attention_op.cc @@ -66,8 +66,8 @@ class SparseAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddComment(R"DOC( Compute the value of the sparse attention module. Its input value includes five tensors. - Q, K, and V represent query, key, and value in the Attention module, respectively. - The CSR format is used to represent the sparsity feature in the Attention module. + Q, K, and V represent query, key, and value in the Attention module, respectively. + The CSR format is used to represent the sparsity feature in the Attention module. The CSR format contains two tensors, offset and columns. )DOC"); } diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 1d47a10d56bc2..19a846afd4376 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -114,7 +114,7 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { $$\sigma{\mathbf{W}} = \mathbf{u}^{T} \mathbf{W} \mathbf{v}$$ - For details of spectral normalization, please refer to paper: + For details of spectral normalization, please refer to paper: `Spectral Normalization `_ . )DOC"); } }; diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc index 55d307cf087ec..dc1848b3ee124 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -140,15 +140,15 @@ class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( SquaredL2Distance operator -This operator will cacluate the squared L2 distance for the input and -the target. Number of distance value will be equal to the first dimension -of input. First dimension of the target could be equal to the input or to 1. -If the first dimension of target is 1, the operator will broadcast target's -first dimension to input's first dimension. During backward propagation, -the user can decide whether to calculate the gradient of the input or +This operator will calculate the squared L2 distance for the input and +the target. Number of distance value will be equal to the first dimension +of input. First dimension of the target could be equal to the input or to 1. +If the first dimension of target is 1, the operator will broadcast target's +first dimension to input's first dimension. During backward propagation, +the user can decide whether to calculate the gradient of the input or the target or both. -Both the input X and Y can carry the LoD (Level of Details) information. +Both the input X and Y can carry the LoD (Level of Details) information. 
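A standalone sketch of the SquaredL2Distance contract above: one distance per row of the input, with the target broadcast along its first dimension when that dimension is 1. The names and the row-major layout are assumptions:

#include <vector>

std::vector<float> SquaredL2Distance(const std::vector<float>& x, int x_rows,
                                     const std::vector<float>& y, int y_rows,
                                     int cols) {
  std::vector<float> out(x_rows, 0.0f);
  for (int i = 0; i < x_rows; ++i) {
    const float* xi = x.data() + i * cols;
    const float* yi = y.data() + (y_rows == 1 ? 0 : i) * cols;  // broadcast
    for (int j = 0; j < cols; ++j) {
      const float d = xi[j] - yi[j];
      out[i] += d * d;
    }
  }
  return out;
}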
However, the output only shares the LoD information with input X. )DOC"); } diff --git a/paddle/fluid/operators/tdm_child_op.cc b/paddle/fluid/operators/tdm_child_op.cc index 1e98035039f85..c91f0b989e3ac 100644 --- a/paddle/fluid/operators/tdm_child_op.cc +++ b/paddle/fluid/operators/tdm_child_op.cc @@ -49,7 +49,7 @@ class TDMChildOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(2); AddComment(R"DOC(" **Tdm Child** - According to the input node_id on the given tree, return the corresponding child node_id and + According to the input node_id on the given tree, return the corresponding child node_id and whether child is a leaf node by LeafMask.")DOC"); } }; diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 37d9897f727b5..4525d431ff136 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -226,7 +226,7 @@ It's similarity to SigmoidCrossEntropyWithLogits Operator. The difference is tha we add another label(z') to original. loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + max(x, 0) - x * z' + log(1 + exp(-abs(x))) z is click or not - z' is teacher value + z' is teacher value label = {-2, -1, [0, 2]} when z' is not exist, clk = 0 : label = -2; when z' is not exist, clk = 1 : label = -1; diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 82f14a2691046..ca446fcb97236 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -74,20 +74,20 @@ class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( This operator calculates the temporal shifting features for Input(X). - Input(X) should be in shape of [N*T, C, H, W] or [N*T, H, W, C], while - N is the batch size, T is the temporal segment number specified by - :attr:`seg_num`, C is the channel number, H and W is the height and + Input(X) should be in shape of [N*T, C, H, W] or [N*T, H, W, C], while + N is the batch size, T is the temporal segment number specified by + :attr:`seg_num`, C is the channel number, H and W is the height and width of features. Temporal Shifting is calculated as follows when data format is NCHW: - + Step 1: Reshape Input(X) to [N, T, C, H, W]. - Step 2: Pad 0 to reshaping result in the 2nd(T) dimension with - padding width as 1 on each side, padding result will be in shape + Step 2: Pad 0 to reshaping result in the 2nd(T) dimension with + padding width as 1 on each side, padding result will be in shape of [N, T+2, C, H, W]. - Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding + Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding result as follows: $$ @@ -100,10 +100,10 @@ class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker { slice3 = x[:, 1:T+1, C/2:, :, :] $$ - Step 4: Concatenate three slices along the 3rd(C) dimension and + Step 4: Concatenate three slices along the 3rd(C) dimension and reshape result to [N*T, C, H, W]. - For details of temporal shifting, please refer to paper: + For details of temporal shifting, please refer to paper: `Temporal Shift Module `_ . 
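The four temporal-shift steps above collapse to a per-element gather; a hedged sketch for NCHW with shift_ratio = 1/4 (illustrative helper, not the kernel):

#include <cstdint>
#include <vector>

// out[n, t, c, hw] pulls from t-1 for the first C/4 channels, from t+1 for
// the next C/4, and from t itself elsewhere; out-of-range t hits the zero
// padding from Step 2.
float TemporalShiftAt(const std::vector<float>& x, int64_t T, int64_t C,
                      int64_t HW, int64_t n, int64_t t, int64_t c,
                      int64_t hw) {
  const int64_t src_t = (c < C / 4) ? t - 1 : (c < C / 2) ? t + 1 : t;
  if (src_t < 0 || src_t >= T) return 0.0f;
  return x[((n * T + src_t) * C + c) * HW + hw];
}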
)DOC"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 7b58a1bb7d6d2..eab919135c7bb 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -159,6 +159,8 @@ void DynamicShapeTest(bool allow_build_at_runtime) { // Execute them. LOG(INFO) << "engine_op run"; + inference::tensorrt::OpTeller::Global().SetOpConverterType( + "fc", inference::tensorrt::OpConverterType::Default); engine_op->Run(scope, place); } diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 4b5630037487f..afc18010bb4df 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -90,8 +90,8 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Top K operator -If the input is a vector (1d tensor), this operator finds the k largest -entries in the vector and outputs their values and indices as vectors. +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j]. For matrices, this operator computes the top k entries in each row. )DOC"); diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 2f915c959fdea..b1b68eb1ed713 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -51,8 +51,8 @@ class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Top K operator -If the input is a vector (1d tensor), this operator finds the k largest -entries in the vector and outputs their values and indices as vectors. +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. Thus values[j] is the j-th largest entry in input, and its index is indices[j]. For matrices, this operator computes the top k entries in each row. )DOC"); diff --git a/paddle/fluid/operators/tril_indices_op.cc b/paddle/fluid/operators/tril_indices_op.cc index c8123dfdf875b..bae34fa5f5635 100644 --- a/paddle/fluid/operators/tril_indices_op.cc +++ b/paddle/fluid/operators/tril_indices_op.cc @@ -61,9 +61,9 @@ class TrilIndicesOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( TrilIndices Operator. - The tril_indices operator returns the indices of the lower triangular part of the matrix - whose rows and cols is knowed. It is a 2-by-x tensor,where the first row contains row coordinates - of all indices and the second row contains column coordinates. Indices are ordered based on + The tril_indices operator returns the indices of the lower triangular part of the matrix + whose rows and cols is knowed. It is a 2-by-x tensor,where the first row contains row coordinates + of all indices and the second row contains column coordinates. Indices are ordered based on rows and then columns. The lower triangular part of the matrix is defined as the elements on and below the diagonal. diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index e81a734c16f82..5d2c3c0797acf 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -41,9 +41,9 @@ class TrilTriuOpMaker : public framework::OpProtoAndCheckerMaker { TrilTriu Operator. 
The tril operator returns the lower triangular part of the matrix (2-D tensor) -or batch of matrices $input$. The lower triangular part of the matrix is defined +or batch of matrices $input$. The lower triangular part of the matrix is defined as the elements on and below the diagonal. -The triu operator returns the upper triangular part of a matrix (2-D tensor) +The triu operator returns the upper triangular part of a matrix (2-D tensor) or batch of matrices $input$. The upper triangular part of the matrix is defined as the elements on and above the diagonal. The other elements of the result tensor out are set to 0. diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 8bb0b20402cbc..b8de9df202812 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -54,7 +54,7 @@ class UnfoldOpMaker : public framework::OpProtoAndCheckerMaker { This Operator is used to extract sliding local blocks from a batched input tensor, also known as im2col when operated on batched 2D image tensor. For each block under the convolution filter, all element will be rearranged as a column. While the convolution filter sliding over the input -feature map, a series of such columns will be formed. +feature map, a series of such columns will be formed. )DOC"); } }; diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc index 3f5c24fe4bfaa..4d772e50e6525 100644 --- a/paddle/fluid/operators/unique_op.cc +++ b/paddle/fluid/operators/unique_op.cc @@ -153,9 +153,9 @@ class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddComment(R"DOC( 1. Return a unique subsequence for 1-D input tensor, and an index tensor - pointing to this unique subsequence when Attr(is_sorted) is false. This + pointing to this unique subsequence when Attr(is_sorted) is false. This means paddle.unique is called. - + 2. Returns the unique elements of X in ascending order when Attr(is_sorted) is true. This means fluid.layers.unique is called. )DOC"); diff --git a/paddle/fluid/operators/unique_with_counts_op.cc b/paddle/fluid/operators/unique_with_counts_op.cc index b86eb72e7d3be..6e60078f6ab48 100644 --- a/paddle/fluid/operators/unique_with_counts_op.cc +++ b/paddle/fluid/operators/unique_with_counts_op.cc @@ -64,7 +64,7 @@ class UniqueWithCountsOpMaker : public framework::OpProtoAndCheckerMaker { "the attr `dtype`"); AddOutput("Count", "A subsequence for the count of unique index"); AddComment(R"DOC( - Return a unique subsequence for 1-D input tensor, index tensor pointing to this unique subsequence, + Return a unique subsequence for 1-D input tensor, index tensor pointing to this unique subsequence, and the subsequence for the count of unique index. )DOC"); } diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index 3aec6b8356877..eb7421019bd81 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -51,9 +51,9 @@ void VarConv2dOpMaker::Make() { AddComment(R"DOC( Var Size Conv Operator - This operator calculate Out = \sigma \left ( W * X + b \right ), + This operator calculates Out = \sigma \left ( W * X + b \right ), only support 2-D for X. - + NOTE: only support 'float32' data type now. 
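Circling back to the tril/triu rule documented above: keep elements on and below (tril) or on and above (triu) the diagonal, and zero the rest. A sketch for a row-major 2-D matrix, with the usual signed diagonal offset assumed:

// `diagonal` = 0 selects the main diagonal; positive values move it up-right.
void TrilTriu(const float* in, float* out, int rows, int cols, int diagonal,
              bool lower) {
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const bool keep = lower ? (j - i <= diagonal) : (j - i >= diagonal);
      out[i * cols + j] = keep ? in[i * cols + j] : 0.0f;
    }
  }
}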
)DOC"); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 6ed27fd9b326d..c910e4b4ea0fb 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -251,6 +251,10 @@ if(WITH_MLU) target_link_libraries(device_context mlu_resource_pool) endif() +if(WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_device_resource_pool) +endif() + cc_test( init_test SRCS init_test.cc @@ -284,11 +288,17 @@ if(WITH_GPU) set(DEVICE_EVENT_LIBS device_event_gpu CACHE INTERNAL "device event libs") - nv_test( - device_event_test - SRCS device_event_test.cc - DEPS device_event_gpu) - + if(WITH_CUSTOM_DEVICE) + nv_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu device_event_custom_device) + else() + nv_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + endif() nv_test( device_context_test SRCS device_context_test.cu @@ -311,11 +321,17 @@ if(WITH_ROCM) set(DEVICE_EVENT_LIBS device_event_gpu CACHE INTERNAL "device event libs") - hip_test( - device_event_test - SRCS device_event_test.cc - DEPS device_event_gpu) - + if(WITH_CUSTOM_DEVICE) + hip_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu device_event_custom_device) + else() + hip_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + endif() hip_test( device_context_test SRCS device_context_test.cu @@ -470,3 +486,13 @@ if(NOT APPLE AND NOT WIN32) DEPS device_code lod_tensor) endif() endif() + +if(WITH_CUSTOM_DEVICE) + cc_library( + device_event_custom_device + SRCS device_event_custom_device.cc + DEPS device_event_base custom_device_resource_pool) + set(DEVICE_EVENT_LIBS + ${DEVICE_EVENT_LIBS} device_event_custom_device + CACHE INTERNAL "device event libs") +endif() diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index 62745883023cb..d01cb2288adaa 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -24,3 +24,7 @@ endif() if(WITH_MLU) add_subdirectory(mlu) endif() + +if(WITH_CUSTOM_DEVICE) + add_subdirectory(custom) +endif() diff --git a/paddle/fluid/platform/device/custom/CMakeLists.txt b/paddle/fluid/platform/device/custom/CMakeLists.txt new file mode 100644 index 0000000000000..142c6c70e797d --- /dev/null +++ b/paddle/fluid/platform/device/custom/CMakeLists.txt @@ -0,0 +1,6 @@ +if(WITH_CUSTOM_DEVICE) + cc_library( + custom_device_resource_pool + SRCS custom_device_resource_pool.cc + DEPS gflags glog enforce monitor) +endif() diff --git a/paddle/fluid/platform/device/custom/custom_device_resource_pool.cc b/paddle/fluid/platform/device/custom/custom_device_resource_pool.cc new file mode 100644 index 0000000000000..1cd6c3bb3f745 --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device_resource_pool.cc @@ -0,0 +1,190 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/platform/device/custom/custom_device_resource_pool.h" + +namespace paddle { +namespace platform { + +CustomDeviceStreamResourcePool::CustomDeviceStreamResourcePool( + const paddle::Place& place) { + PADDLE_ENFORCE_EQ( + platform::is_custom_place(place), + true, + platform::errors::PreconditionNotMet( + "Required device shall be CustomPlace, but received %d. ", place)); + + int dev_cnt = phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [place, dev_idx] { + auto place_ = phi::CustomPlace(place.GetDeviceType(), dev_idx); + phi::DeviceManager::SetDevice(place_); + + phi::stream::Stream* stream = new phi::stream::Stream(place_, nullptr); + phi::DeviceManager::GetDeviceWithPlace(place_)->CreateStream(stream); + return stream; + }; + + auto deleter = [place, dev_idx](phi::stream::Stream* stream) { + auto place_ = phi::CustomPlace(place.GetDeviceType(), dev_idx); + phi::DeviceManager::SetDevice(place_); + + phi::DeviceManager::GetDeviceWithPlace(place_)->DestroyStream(stream); + delete stream; + }; + + pool_.emplace_back( + ResourcePool::Create(creator, deleter)); + } +} + +CustomDeviceStreamResourcePool& CustomDeviceStreamResourcePool::Instance( + const paddle::Place& place) { + static std::unordered_map< + std::string, + std::vector>> + pool; + PADDLE_ENFORCE_EQ( + platform::is_custom_place(place), + true, + platform::errors::PreconditionNotMet( + "Required device shall be CustomPlace, but received %d. ", place)); + if (pool.find(place.GetDeviceType()) == pool.end()) { + pool.insert( + {place.GetDeviceType(), + std::vector>()}); + for (size_t i = 0; + i < phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); + ++i) { + pool[place.GetDeviceType()].emplace_back( + new CustomDeviceStreamResourcePool( + paddle::platform::CustomPlace(place.GetDeviceType(), i))); + } + } + PADDLE_ENFORCE_LT( + place.GetDeviceId(), + pool[place.GetDeviceType()].size(), + platform::errors::OutOfRange("Device id is out of range, device id shall " + "be less than %d, but received %d. ", + pool[place.GetDeviceType()].size(), + place.GetDeviceId())); + return *pool[place.GetDeviceType()][place.GetDeviceId()]; +} + +std::shared_ptr CustomDeviceStreamResourcePool::New( + int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, + 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, + pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), + dev_idx)); + return pool_[dev_idx]->New(); +} + +CustomDeviceEventResourcePool::CustomDeviceEventResourcePool( + const paddle::Place& place) { + PADDLE_ENFORCE_EQ( + platform::is_custom_place(place), + true, + platform::errors::PreconditionNotMet( + "Required device shall be CustomPlace, but received %d. 
", place)); + + int dev_cnt = phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [place, dev_idx] { + auto place_ = phi::CustomPlace(place.GetDeviceType(), dev_idx); + phi::DeviceManager::SetDevice(place_); + + phi::event::Event* event = new phi::event::Event(place_, nullptr); + phi::DeviceManager::GetDeviceWithPlace(place_)->CreateEvent(event); + return event; + }; + + auto deleter = [place, dev_idx](phi::event::Event* event) { + auto place_ = phi::CustomPlace(place.GetDeviceType(), dev_idx); + phi::DeviceManager::SetDevice(place_); + + phi::DeviceManager::GetDeviceWithPlace(place_)->DestroyEvent(event); + }; + + pool_.emplace_back( + ResourcePool::Create(creator, deleter)); + } +} + +CustomDeviceEventResourcePool& CustomDeviceEventResourcePool::Instance( + const phi::Place& place) { + static std::unordered_map< + std::string, + std::vector>> + pool; + PADDLE_ENFORCE_EQ( + platform::is_custom_place(place), + true, + platform::errors::PreconditionNotMet( + "Required device shall be CustomPlace, but received %d. ", place)); + if (pool.find(place.GetDeviceType()) == pool.end()) { + pool.insert( + {place.GetDeviceType(), + std::vector>()}); + for (size_t i = 0; + i < phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); + ++i) { + pool[place.GetDeviceType()].emplace_back( + new CustomDeviceEventResourcePool( + paddle::platform::CustomPlace(place.GetDeviceType(), i))); + } + } + PADDLE_ENFORCE_LT( + place.GetDeviceId(), + pool[place.GetDeviceType()].size(), + platform::errors::OutOfRange("Device id is out of range, device id shall " + "be less than %d, but received %d. ", + pool[place.GetDeviceType()].size(), + place.GetDeviceId())); + return *pool[place.GetDeviceType()][place.GetDeviceId()]; +} + +std::shared_ptr CustomDeviceEventResourcePool::New( + int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, + 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, + pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), + dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/custom/custom_device_resource_pool.h b/paddle/fluid/platform/device/custom/custom_device_resource_pool.h new file mode 100644 index 0000000000000..c643cff7b5451 --- /dev/null +++ b/paddle/fluid/platform/device/custom/custom_device_resource_pool.h @@ -0,0 +1,65 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include +#include +#include + +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/resource_pool.h" +#include "paddle/phi/backends/device_manager.h" + +namespace paddle { +namespace platform { + +using CustomDeviceStreamObject = phi::stream::Stream; +using CustomDeviceEventObject = phi::event::Event; + +class CustomDeviceStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static CustomDeviceStreamResourcePool& Instance(const paddle::Place& place); + + private: + explicit CustomDeviceStreamResourcePool(const paddle::Place& place); + + DISABLE_COPY_AND_ASSIGN(CustomDeviceStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class CustomDeviceEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static CustomDeviceEventResourcePool& Instance(const paddle::Place& place); + + private: + explicit CustomDeviceEventResourcePool(const paddle::Place& place); + + DISABLE_COPY_AND_ASSIGN(CustomDeviceEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 7939f8ff7c066..d8ebb019fc691 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -130,7 +130,7 @@ constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kNPU = DeviceType::NPU; constexpr DeviceType kIPU = DeviceType::IPU; constexpr DeviceType kMLU = DeviceType::MLU; -constexpr DeviceType kCUSOTM_DEVICE = DeviceType::CUSTOM_DEVICE; +constexpr DeviceType kCUSTOM_DEVICE = DeviceType::CUSTOM_DEVICE; using DeviceContext = phi::DeviceContext; diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index cf80266050af2..8659d8be902b6 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -25,6 +25,7 @@ using ::paddle::platform::kCPU; using ::paddle::platform::kCUDA; +using ::paddle::platform::kCUSTOM_DEVICE; using ::paddle::platform::kNPU; using ::paddle::platform::kXPU; @@ -42,3 +43,9 @@ USE_EVENT(kNPU); USE_EVENT_WAIT(kNPU, kNPU) USE_EVENT_WAIT(kCPU, kNPU) #endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +USE_EVENT(kCUSTOM_DEVICE); +USE_EVENT_WAIT(kCUSTOM_DEVICE, kCUSTOM_DEVICE) +USE_EVENT_WAIT(kCPU, kCUSTOM_DEVICE) +#endif diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index 6a2948480b549..d0458dcb9e4e4 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -64,11 +64,13 @@ class DeviceEvent { "Required type < %d, but received type = %d", MaxDeviceTypes, type_id_)); +#ifndef PADDLE_WITH_CUSTOM_DEVICE // TODO(Aurelius84): only support CPU/CUDA/NPU. PADDLE_ENFORCE_LT(type_id_, 3, platform::errors::Unavailable( "Currently DeviceEvent do not support %s", place)); +#endif PADDLE_ENFORCE_NOT_NULL( event_creator_[type_id_], platform::errors::Unavailable( diff --git a/paddle/fluid/platform/device_event_custom_device.cc b/paddle/fluid/platform/device_event_custom_device.cc new file mode 100644 index 0000000000000..a45cb43baf2ec --- /dev/null +++ b/paddle/fluid/platform/device_event_custom_device.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_CUSTOM_DEVICE + +#include "paddle/fluid/platform/device/custom/custom_device_resource_pool.h" +#include "paddle/fluid/platform/device_event_base.h" +#include "paddle/fluid/platform/event.h" +namespace paddle { +namespace platform { +struct CustomDeviceEventWrapper { + explicit CustomDeviceEventWrapper(const platform::Place& place) { + PADDLE_ENFORCE_EQ( + platform::is_custom_place(place), + true, + platform::errors::PreconditionNotMet( + "Required device shall be CustomPlace, but received %d. ", place)); + + device_id_ = place.device; + PADDLE_ENFORCE_GT( + device_id_, + -1, + platform::errors::PreconditionNotMet( + "Required DeviceOption.device_id > -1, but received %d. ", + device_id_)); + inner_event_ = + CustomDeviceEventResourcePool::Instance(place).New(device_id_); + } + std::shared_ptr inner_event_; + int device_id_; +}; + +void DeviceEventCreateCustomDevice(DeviceEvent* event, + const platform::Place& place, + unsigned int) { + event->InitEvent(std::make_shared(place)); +} + +void DeviceEventRecordCustomDevice(DeviceEvent* event, + const DeviceContext* context) { + auto* wrapper = + static_cast(event->GetEvent().get()); + auto* custom_device_ctx = + dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + custom_device_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into CustomDeviceContext.")); + + phi::stream::Stream stream_wrapper(custom_device_ctx->GetPlace(), + custom_device_ctx->stream()); + wrapper->inner_event_->Record(&stream_wrapper); +} + +bool DeviceEventQueryCustomDevice(const DeviceEvent* event) { + auto* wrapper = + static_cast(event->GetEvent().get()); + PADDLE_ENFORCE_NOT_NULL( + wrapper, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast event into CustomDeviceEventWrapper.")); + return wrapper->inner_event_->Query(); +} + +void DeviceEventFinishCustomDevice(const DeviceEvent* event) { + auto* wrapper = + static_cast(event->GetEvent().get()); + wrapper->inner_event_->Synchonrize(); +} + +void DeviceEventCustomDeviceWaitCustomDevice(const DeviceEvent* event, + const DeviceContext* context) { + auto* wrapper = + static_cast(event->GetEvent().get()); + auto* custom_device_ctx = + dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + custom_device_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into CustomDeviceContext.")); + phi::stream::Stream stream_wrapper(custom_device_ctx->GetPlace(), + custom_device_ctx->stream()); + stream_wrapper.WaitEvent(wrapper->inner_event_.get()); +} + +void DeviceEventCPUWaitCustomDevice(const DeviceEvent* event, + const DeviceContext* context) { + DeviceEventFinishCustomDevice(event); +} + +void DeviceEventSetFinishedCustomDevice(const DeviceEvent* event) { + // do nothing +} + +void EventResetCustomDevice(const DeviceEvent* event) { + // do nothing +} + +} // namespace platform +} // namespace paddle + +using ::paddle::platform::kCPU; +using ::paddle::platform::kCUSTOM_DEVICE; +REGISTER_EVENT_CREATE_FUNCTION(kCUSTOM_DEVICE, + paddle::platform::DeviceEventCreateCustomDevice) +REGISTER_EVENT_RECORD_FUNCTION(kCUSTOM_DEVICE, +
paddle::platform::DeviceEventRecordCustomDevice) +REGISTER_EVENT_QUERY_FUNCTION(kCUSTOM_DEVICE, + paddle::platform::DeviceEventQueryCustomDevice) +REGISTER_EVENT_FINISH_FUNCTION(kCUSTOM_DEVICE, + paddle::platform::DeviceEventFinishCustomDevice) +REGISTER_EVENT_SET_FINISHED_FUNCTION( + kCUSTOM_DEVICE, paddle::platform::DeviceEventSetFinishedCustomDevice) +REGISTER_EVENT_WAIT_FUNCTION( + kCUSTOM_DEVICE, + kCUSTOM_DEVICE, + paddle::platform::DeviceEventCustomDeviceWaitCustomDevice) +REGISTER_EVENT_WAIT_FUNCTION(kCPU, + kCUSTOM_DEVICE, + paddle::platform::DeviceEventCPUWaitCustomDevice) +REGISTER_EVENT_RESET_FUNCTION(kCUSTOM_DEVICE, + paddle::platform::EventResetCustomDevice) +#endif diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index dd66ab179b26d..813171240da06 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -659,7 +659,11 @@ PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error * message summary will be shown. */ +#ifdef PADDLE_NO_PYTHON +static const int32_t kDefaultCallStackLevel = 2; +#else static const int32_t kDefaultCallStackLevel = 1; +#endif PADDLE_DEFINE_EXPORTED_int32( call_stack_level, @@ -1011,6 +1015,7 @@ PADDLE_DEFINE_EXPORTED_bool( * Note: * FLAGS_jit_engine_type == Executor, using ExecutorEngine by default * FLAGS_jit_engine_type == PE, using PEEngine by default + * FLAGS_jit_engine_type == New, using InterpreterEngine by default */ PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "PE", diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 933ac4f12e3c4..ca099cb65d67c 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -196,7 +196,8 @@ class MatMulV2MKLDNNHandler out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; } - if (!IsInt8() && !IsBfloat16() && is_output_fused) { + // TODO(jczaja): Why not for int8?? 
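+  // Note: the rewritten condition below also drops the !IsBfloat16() guard,
+  // so bf16 now takes the FakeTransposeStrides() path for fused output
+  // transposes as well; only the int8 exclusion questioned above remains.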
+ if (!IsInt8() && is_output_fused) { out_strides = FakeTransposeStrides(out_ddims); } diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 287628c85e504..d2e097994fc0b 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -130,10 +130,10 @@ void ChromeTracingLogger::LogMemTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "[memory]", "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, - "ph": "i", "cat": "%s", + "ts": %lld, + "ph": "i", "cat": "%s", "args": { "place": "%s", "addr": "%llu", @@ -196,10 +196,10 @@ void ChromeTracingLogger::LogHostTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": "%lld(Python)", "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "thread_state_runnable", "args": { "start_time": "%.3f us", @@ -223,10 +223,10 @@ void ChromeTracingLogger::LogHostTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "thread_state_runnable", "args": { "start_time": "%.3f us", @@ -263,10 +263,10 @@ void ChromeTracingLogger::LogHostTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "thread_state_runnable", "args": { "start_time": "%.3f us", @@ -304,10 +304,10 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "thread_state_running", "args": { "correlation id": %d, @@ -331,9 +331,9 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "launch", "id": %d, "pid": %lld, "tid": "%lld(C++)", - "ts": %lld, + "ts": %lld, "ph": "s", "cat": "async" }, )JSON"), @@ -365,9 +365,9 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( if (nsToUs(device_node.Duration()) == 0) { output_file_stream_ << string_format(std::string( R"JSON( - { + { "name": "launch", "id": %d, "pid": %lld, "tid": %lld, - "ts": %lld, + "ts": %lld, "ph": "f", "cat": "async" }, )JSON"), @@ -381,9 +381,9 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "launch", "id": %d, "pid": %lld, "tid": %lld, - "ts": %lld, + "ts": %lld, "ph": "f", "cat": "async", "bp": "e" }, )JSON"), @@ -410,10 +410,10 @@ void ChromeTracingLogger::HandleTypeKernel( output_file_stream_ << string_format( std::string( R"JSON( - { + { "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "cq_build_failed", "args": { "start_time": "%.3f us", @@ -476,7 +476,7 @@ void ChromeTracingLogger::HandleTypeMemcpy( { "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, "dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "cq_build_failed", "args": { "start_time": "%.3f us", @@ -517,7 +517,7 @@ void ChromeTracingLogger::HandleTypeMemset( { "name": "%s[%s]", "pid": %lld, "tid": %lld, "ts": %lld, 
"dur": %.3f, - "ph": "X", "cat": "%s", + "ph": "X", "cat": "%s", "cname": "cq_build_failed", "args": { "start_time": "%.3f us", @@ -548,7 +548,7 @@ void ChromeTracingLogger::HandleTypeMemset( void ChromeTracingLogger::StartLog() { output_file_stream_ << std::string( R"JSON( - { + { "displayTimeUnit": "ms",)JSON"); } @@ -717,49 +717,49 @@ void ChromeTracingLogger::RefineDisplayName( R"JSON( { "name": "process_name", "pid": %lld, "tid": "%lld(Python)", - "ph": "M", + "ph": "M", "args": { "name": "Process %lld (CPU)" } }, { "name": "process_name", "pid": %lld, "tid": "%lld(C++)", - "ph": "M", + "ph": "M", "args": { "name": "Process %lld (CPU)" } }, { "name": "thread_name", "pid": %lld, "tid": "%lld(Python)", - "ph": "M", + "ph": "M", "args": { "name": "thread %lld:%s(Python)" } }, { "name": "thread_name", "pid": %lld, "tid": "%lld(C++)", - "ph": "M", + "ph": "M", "args": { "name": "thread %lld:%s(C++)" } }, { "name": "process_sort_index", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } - }, + }, { "name": "thread_sort_index", "pid": %lld, "tid": "%lld(Python)", - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } }, { "name": "thread_sort_index", "pid": %lld, "tid": "%lld(C++)", - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } @@ -803,32 +803,32 @@ void ChromeTracingLogger::RefineDisplayName( R"JSON( { "name": "process_name", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "name": "Deivce %lld (%s)" } }, { "name": "thread_name", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "name": "stream %lld" } }, { "name": "process_sort_index", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } - }, + }, { "name": "thread_sort_index", "pid": %lld, "tid": %lld, - "ph": "M", + "ph": "M", "args": { "sort_index": %lld } - }, + }, )JSON"), (*it).first, (*it).second, diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 72885c0bbe5b7..a1231d7f55322 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -41,7 +41,8 @@ set(PYBIND_DEPS new_profiler auto_parallel jit_layer - jit_property) + jit_property + saved_tensors_hooks) if(WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -573,24 +574,24 @@ if(WITH_PYTHON) endif() cc_library( - paddle_pybind SHARED + libpaddle SHARED SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_dependencies(paddle_pybind legacy_eager_codegen) - add_dependencies(paddle_pybind eager_legacy_op_function_generator_cmd) + add_dependencies(libpaddle legacy_eager_codegen) + add_dependencies(libpaddle eager_legacy_op_function_generator_cmd) endif() if(NOT APPLE AND NOT WIN32) - target_link_libraries(paddle_pybind rt) + target_link_libraries(libpaddle rt) endif() if(WITH_ROCM) - target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB}) + target_link_libraries(libpaddle ${ROCM_HIPRTC_LIB}) endif() get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(paddle_pybind ${os_dependency_modules}) - add_dependencies(paddle_pybind op_function_generator_cmd) + target_link_libraries(libpaddle ${os_dependency_modules}) + add_dependencies(libpaddle op_function_generator_cmd) endif() diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 66cd20340ca85..65e759d3b2055 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ 
-98,12 +98,12 @@ void BindCudaStream(py::module *m_ptr) { The handle of the CUDA stream. Parameters: - device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. - If device is None or negative integer, device will be the current device. - If device is positive integer, it must less than the device count. Default: None. + device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. + If device is None or negative integer, device will be the current device. + If device is positive integer, it must less than the device count. Default: None. priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). - If priority is None, the priority is 2(normal). Default: None. + If priority is None, the priority is 2(normal). Default: None. Examples: .. code-block:: python @@ -126,7 +126,7 @@ void BindCudaStream(py::module *m_ptr) { Parameters: event(CUDAEvent): The event to wait on. - + Examples: .. code-block:: python @@ -149,7 +149,7 @@ void BindCudaStream(py::module *m_ptr) { Parameters: stream(CUDAStream): The stream to synchronize with. - + Examples: .. code-block:: python @@ -207,7 +207,7 @@ void BindCudaStream(py::module *m_ptr) { Parameters: event(CUDAEvent, optional): The event to be record. If event is None, a new event is created. Default: None. - + Returns: The recored event. @@ -238,7 +238,7 @@ void BindCudaStream(py::module *m_ptr) { import ctypes cuda_stream = paddle.device.cuda.current_stream().cuda_stream print(cuda_stream) - + ptr = ctypes.c_void_p(cuda_stream) # convert back to void* print(ptr) @@ -322,7 +322,7 @@ void BindCudaStream(py::module *m_ptr) { enable_timing(bool, optional): Whether the event will measure time. Default: False. blocking(bool, optional): Whether the wait() func will be blocking. Default: False; interprocess(bool, optional): Whether the event can be shared between processes. Defalut: False. - + Examples: .. code-block:: python @@ -345,7 +345,7 @@ void BindCudaStream(py::module *m_ptr) { Parameters: stream(CUDAStream, optional): The handle of CUDA stream. If None, the stream is the current stream. Default: None. - + Examples: .. 
code-block:: python @@ -353,7 +353,7 @@ void BindCudaStream(py::module *m_ptr) { import paddle event = paddle.device.cuda.Event() event.record() - + )DOC", py::arg("stream") = nullptr) .def( diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 5a7e2355f64eb..8a434f42811a8 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -196,6 +196,23 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::call_guard()) + .def( + "send", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int dst, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, dst, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("sync_op"), + py::call_guard()) + .def( "send_partial", [](distributed::ProcessGroup &self, @@ -217,6 +234,30 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()) + .def( + "send_partial", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int dst_rank, + int nranks, + int rank_id, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int send_numel = numel / nranks; + int offset = send_numel * rank_id; + return self.Send_Partial( + *dense, dst_rank, offset, send_numel, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("num"), + py::arg("id"), + py::arg("sync_op"), + py::call_guard()) + .def( "recv", [](distributed::ProcessGroup &self, @@ -232,6 +273,23 @@ void BindDistributed(py::module *m) { py::arg("src"), py::call_guard()) + .def( + "recv", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, src, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + .def( "recv_partial", [](distributed::ProcessGroup &self, @@ -253,6 +311,30 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()) + .def( + "recv_partial", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src_rank, + int nranks, + int rank_id, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int recv_numel = numel / nranks; + int offset = recv_numel * rank_id; + return self.Recv_Partial( + *dense, src_rank, offset, recv_numel, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("num"), + py::arg("id"), + py::arg("sync_op"), + py::call_guard()) + .def( "all_gather", [](distributed::ProcessGroup &self, @@ -427,6 +509,94 @@ void BindDistributed(py::module *m) { }, py::arg("tensor"), py::arg("op"), + py::call_guard()) + + .def( + "send_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int dst) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, + dst, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::call_guard()) + + .def( + "send_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, 
+ int dst_rank, + int nranks, + int rank_id) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int send_numel = numel / nranks; + int offset = send_numel * rank_id; + return self.Send_Partial(*dense, + dst_rank, + offset, + send_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("num"), + py::arg("id"), + py::call_guard()) + + .def( + "recv_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, + src, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::call_guard()) + + .def( + "recv_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src_rank, + int nranks, + int rank_id) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int numel = (*dense).numel(); + int recv_numel = numel / nranks; + int offset = recv_numel * rank_id; + return self.Recv_Partial(*dense, + src_rank, + offset, + recv_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("num"), + py::arg("id"), py::call_guard()); #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h index f617ead08e243..8a4a42b82a253 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -12,6 +12,7 @@ limitations under the License. */ #include +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/pylayer/py_layer_node.h" #include "paddle/phi/core/dense_tensor.h" #include "pybind11/pybind11.h" @@ -28,6 +29,8 @@ typedef struct { typedef struct { PyObject_HEAD PyObject* container; + bool container_be_packed; + std::shared_ptr unpack_hook; PyObject* non_differentiable; PyObject* not_inplace_tensors; bool materialize_grads; diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index 1bb8fdd936064..85afc274623ea 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -30,13 +30,13 @@ static PyObject *eager_api_linear(PyObject *self, auto bias = GetTensorFromArgs("linear", "Bias", args, 2, true); tstate = PyEval_SaveThread(); if (bias.initialized()) { - auto mm_out = matmul_dygraph_function(x, weight, false, false); - auto out = add_dygraph_function(mm_out, bias); + auto mm_out = matmul_ad_func(x, weight, false, false); + auto out = add_ad_func(mm_out, bias); PyEval_RestoreThread(tstate); tstate = nullptr; return ToPyObject(out); } else { - auto mm_out = matmul_dygraph_function(x, weight, false, false); + auto mm_out = matmul_ad_func(x, weight, false, false); PyEval_RestoreThread(tstate); tstate = nullptr; return ToPyObject(mm_out); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 16a5cff031d65..956d8e5814cc0 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -25,6 +25,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/backward.h" #include "paddle/fluid/eager/custom_operator/custom_operator_node.h" +#include 
"paddle/fluid/eager/saved_tensors_hooks.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/custom_operator.h" @@ -591,6 +592,29 @@ static PyObject* eager_api_sparse_csr_tensor(PyObject* self, return ToPyObject(tensor); EAGER_CATCH_AND_THROW_RETURN_NULL } + +static PyObject* eager_api_register_saved_tensors_hooks(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + if (egr::Controller::Instance().HasGrad()) { + auto pack_hook = PyTuple_GET_ITEM(args, 0); + auto unpack_hook = PyTuple_GET_ITEM(args, 1); + egr::SavedTensorsHooks::GetInstance().SetHooks(pack_hook, unpack_hook); + } + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_reset_saved_tensors_hooks(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + egr::SavedTensorsHooks::GetInstance().ResetHooks(); + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + #if defined(PADDLE_WITH_CUDA) static PyObject* eager_api_async_read(PyObject* self, PyObject* args, @@ -965,6 +989,14 @@ PyMethodDef variable_functions[] = { (PyCFunction)(void (*)(void))eager_api_sparse_csr_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, + {"register_saved_tensors_hooks", + (PyCFunction)(void (*)(void))eager_api_register_saved_tensors_hooks, + METH_VARARGS | METH_KEYWORDS, + NULL}, + {"reset_saved_tensors_hooks", + (PyCFunction)(void (*)(void))eager_api_reset_saved_tensors_hooks, + METH_VARARGS | METH_KEYWORDS, + NULL}, /**sparse functions**/ #if defined(PADDLE_WITH_CUDA) {"async_read", diff --git a/paddle/fluid/pybind/eager_legacy_custom_python_api.h b/paddle/fluid/pybind/eager_legacy_custom_python_api.h index 7ed58a1e956f6..c599346bdb7a8 100644 --- a/paddle/fluid/pybind/eager_legacy_custom_python_api.h +++ b/paddle/fluid/pybind/eager_legacy_custom_python_api.h @@ -38,7 +38,7 @@ static PyObject *eager_api_run_program(PyObject *self, "run_program", args, 6, PyTuple_GET_SIZE(args), attrs); tstate = PyEval_SaveThread(); - run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); + run_program_ad_func(X, Params, Out, OutScope, DOut, attrs); PyEval_RestoreThread(tstate); tstate = nullptr; Py_RETURN_NONE; diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 7c2065b33db36..26326be70043a 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -808,14 +808,14 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, decrease_axis.end()); if (op_type == "slice") { - out = slice_dygraph_function(self->tensor, - slice_axes_tmp, - slice_starts, - slice_ends, - infer_flags_tmp, - decrease_axis_tmp); + out = slice_ad_func(self->tensor, + slice_axes_tmp, + slice_starts, + slice_ends, + infer_flags_tmp, + decrease_axis_tmp); } else if (op_type == "strided_slice") { - out = strided_slice_dygraph_function( + out = strided_slice_ad_func( self->tensor, slice_axes, slice_starts, slice_ends, slice_strides); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -854,7 +854,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, } paddle::experimental::Tensor new_out; - new_out = unsqueeze_dygraph_function(out, none_axes); + new_out = unsqueeze_ad_func(out, none_axes); return ToPyObject(new_out); } } @@ -870,7 +870,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, paddle::framework::TensorFromVector( list_select_idxs, *dev_ctx, idx_tensor.get()); framework::AttributeMap attrs = {{"dim", 0}}; - out = 
index_select_dygraph_function(self->tensor, select_index, 0); + out = index_select_ad_func(self->tensor, select_index, 0); } return ToPyObject(out); diff --git a/paddle/fluid/pybind/eager_py_layer.cc b/paddle/fluid/pybind/eager_py_layer.cc index 7e25b06e80a4d..f39dc6d74f4eb 100644 --- a/paddle/fluid/pybind/eager_py_layer.cc +++ b/paddle/fluid/pybind/eager_py_layer.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/pylayer/py_layer_node.h" +#include "paddle/fluid/eager/saved_tensors_hooks.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -78,6 +79,7 @@ PyObject* PyLayerNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { if (obj) { auto v = reinterpret_cast(obj); v->materialize_grads = true; + v->container_be_packed = false; new (&v->grad_node) std::weak_ptr(); new (&v->forward_input_tensor_is_duplicable) std::vector(); new (&v->forward_output_tensor_is_duplicable) std::vector(); @@ -96,6 +98,7 @@ static void PyLayerDealloc(PyLayerObject* self) { Py_DECREF(self->not_inplace_tensors); } self->grad_node.~weak_ptr(); + self->unpack_hook = nullptr; self->forward_input_tensor_is_duplicable.~vector(); self->forward_output_tensor_is_duplicable.~vector(); Py_TYPE(self)->tp_free(reinterpret_cast(self)); @@ -455,23 +458,148 @@ PyObject* pylayer_method_apply(PyObject* cls, EAGER_CATCH_AND_THROW_RETURN_NULL } +PyObject* call_unpack_hook(PyLayerObject* self) { + auto unpack_hook = self->unpack_hook; + auto packed_value = self->container; + + auto packed_value_size = PyTuple_GET_SIZE(packed_value); + auto unpacked_value = PyTuple_New(packed_value_size); + + for (Py_ssize_t i = 0; i < packed_value_size; i++) { + PyObject* obj = PyTuple_GET_ITEM(packed_value, i); + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + auto tmp_list = PyList_New(len); + for (Py_ssize_t j = 0; j < len; j++) { + PyObject* o = PyList_GetItem(obj, j); + PyList_SET_ITEM(tmp_list, + j, + reinterpret_cast(((*unpack_hook)( + reinterpret_cast(o), nullptr)))); + } + PyTuple_SET_ITEM(unpacked_value, i, tmp_list); + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + auto tmp_tuple = PyTuple_New(len); + for (Py_ssize_t j = 0; j < len; j++) { + PyObject* o = PyTuple_GetItem(obj, j); + PyTuple_SET_ITEM(tmp_tuple, + j, + reinterpret_cast((*unpack_hook)( + reinterpret_cast(o), nullptr))); + } + PyTuple_SET_ITEM(unpacked_value, i, tmp_tuple); + } else { + PyTuple_SET_ITEM(unpacked_value, + i, + reinterpret_cast((*unpack_hook)( + reinterpret_cast(obj), nullptr))); + } + } + + return unpacked_value; +} + PyObject* tensor_properties_get_container(PyLayerObject* self, void* closure) { EAGER_TRY if (self->container == nullptr) { RETURN_PY_NONE; } - Py_INCREF(self->container); - return self->container; + if (self->container_be_packed) { + return call_unpack_hook(self); + } else { + Py_INCREF(self->container); + return self->container; + } EAGER_CATCH_AND_THROW_RETURN_NULL }
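+
+// call_pack_hook below is the inverse of call_unpack_hook above: packing runs
+// eagerly when save_for_backward stores values (each eager tensor is replaced
+// by whatever the registered pack hook returns), while unpacking runs lazily
+// when `container` is read back for backward. The assumed contract is that
+// unpack(pack(t)) yields a tensor usable in place of t, with list/tuple
+// containers mapped element-wise.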
+void call_pack_hook(PyLayerObject* self, PyObject* value) { + PyObject* saved_value = nullptr; + if (PyTuple_Check(value)) { + saved_value = value; + } else if (PyList_Check(value)) { + saved_value = PyList_AsTuple(value); + } else { + saved_value = PyTuple_New(1); + Py_INCREF(value); + PyTuple_SET_ITEM(saved_value, 0, value); + } + + auto pack_hook = egr::SavedTensorsHooks::GetInstance().GetPackHook(); + self->unpack_hook = egr::SavedTensorsHooks::GetInstance().GetUnPackHook(); + + auto saved_value_size = PyTuple_GET_SIZE(saved_value); + PyObject* packed_value = PyTuple_New(saved_value_size); + + for (Py_ssize_t i = 0; i < saved_value_size; i++) { + PyObject* obj = PyTuple_GET_ITEM(saved_value, i); + if (IsEagerTensor(obj)) { + PyTuple_SET_ITEM(packed_value, + i, + reinterpret_cast( + (*pack_hook)(reinterpret_cast(obj)))); + } else if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + auto tmp_list = PyList_New(len); + for (Py_ssize_t j = 0; j < len; j++) { + PyObject* o = PyList_GetItem(obj, j); + if (IsEagerTensor(o)) { + PyList_SET_ITEM(tmp_list, + j, + reinterpret_cast( + (*pack_hook)(reinterpret_cast(o)))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "save_for_backward only support Tensor, list of Tensor, tuple of " + "Tensor.")); + } + } + PyTuple_SET_ITEM(packed_value, i, tmp_list); + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + auto tmp_tuple = PyTuple_New(len); + for (Py_ssize_t j = 0; j < len; j++) { + PyObject* o = PyTuple_GetItem(obj, j); + if (IsEagerTensor(o)) { + PyTuple_SET_ITEM(tmp_tuple, + j, + reinterpret_cast( + (*pack_hook)(reinterpret_cast(o)))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "save_for_backward only support Tensor, list of Tensor, tuple of " + "Tensor.")); + } + } + PyTuple_SET_ITEM(packed_value, i, tmp_tuple); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "save_for_backward only support Tensor, list of Tensor, tuple of " + "Tensor.")); + } + } + + if (!PyTuple_Check(value)) { + // saved_value is a temporary created above (PyList_AsTuple/PyTuple_New); + // when value was already a tuple, saved_value is a borrowed reference. + Py_XDECREF(saved_value); + } + + Py_XDECREF(self->container); + self->container = packed_value; + self->container_be_packed = true; +} + int tensor_properties_set_container(PyLayerObject* self, PyObject* value, void* closure) { EAGER_TRY - Py_XINCREF(value); - Py_XDECREF(self->container); - self->container = value; + if (egr::SavedTensorsHooks::GetInstance().IsEnable()) { + call_pack_hook(self, value); + } else { + Py_XINCREF(value); + Py_XDECREF(self->container); + self->container = value; + } return 0; EAGER_CATCH_AND_THROW_RETURN_NEG } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aeaa0dbff7816..3dc87f0f7cc04 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1231,7 +1231,7 @@ void BindImperative(py::module *m_ptr) { }, R"DOC( Returns a numpy array shows the value of current Tensor. - + Returns: ndarray: The numpy value of current Tensor. @@ -1348,10 +1348,10 @@ void BindImperative(py::module *m_ptr) { # Due to sharing of data with origin Tensor, There are some unsafe operations: y = 2 * x detach_x[:] = 5.0 - y.backward() + y.backward() # It will raise Error: # one of the variables needed for gradient computation has been modified by an inplace operation. - + )DOC") .def("clear_gradient", &imperative::VarBase::ClearGradient, @@ -1618,7 +1618,7 @@ void BindImperative(py::module *m_ptr) { import paddle x = paddle.to_tensor(1.0, place=paddle.CUDAPlace(0)) print(x.place) # CUDAPlace(0) - + y = x.cpu() print(y.place) # CPUPlace @@ -1708,12 +1708,12 @@ void BindImperative(py::module *m_ptr) { R"DOC( Returns a copy of this Tensor in GPU memory. - If this Tensor is already in GPU memory and device_id is default, + If this Tensor is already in GPU memory and device_id is default, then no copy is performed and the original Tensor is returned. - + Args: device_id(int, optional): The destination GPU device id. 
Default: None, means current device. - blocking(bool, optional): If False and the source is in pinned memory, the copy will be + blocking(bool, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. Default: False. Examples: .. code-block:: python # required: gpu import paddle x = paddle.to_tensor(1.0) y = x.cuda() print(y.place) # Place(gpu:0) - + y = x.cuda(None) print(y.place) # Place(gpu:0) @@ -2011,7 +2011,7 @@ void BindImperative(py::module *m_ptr) { }) .def("element_size", &imperative::VarBase::ElementSize, R"DOC( Returns the size in bytes of an element in the Tensor. - + Examples: .. code-block:: python @@ -2076,8 +2076,8 @@ void BindImperative(py::module *m_ptr) { R"DOC( Whether a Tensor is leaf Tensor. - For the Tensor whose stop_gradient is ``True`` , it will be leaf Tensor. - + For the Tensor whose stop_gradient is ``True`` , it will be leaf Tensor. + For the Tensor whose stop_gradient is ``False`` , it will be leaf Tensor too if it is created by user. Returns: @@ -2721,7 +2721,7 @@ void BindImperative(py::module *m_ptr) { Returns: - new_tensor(paddle.Tensor): Return the UVA Tensor with the sample dtype and + new_tensor(paddle.Tensor): Return the UVA Tensor with the same dtype and shape as the input numpy array. Examples: .. code-block:: python # required: gpu import numpy as np import paddle - + data = np.random.randint(10, size=(3, 4)) tensor = paddle.fluid.core.to_uva_tensor(data) print(tensor) @@ -2834,38 +2834,38 @@ void BindImperative(py::module *m_ptr) { } }, R"DOC( - This api provides a way to write pieces of source tensor to destination tensor - inplacely and asynchronously. In which, we use `offset` and `count` to determine - where to copy. `offset` means the begin points of the copy pieces of `src`, and - `count` means the lengths of the copy pieces of `src`. To be noted, the copy process - will run asynchronously from cuda to pin memory. We can simply remember this as + This api provides a way to write pieces of the source tensor to the destination tensor + in place and asynchronously. Here, we use `offset` and `count` to determine + where to copy. `offset` means the begin points of the copied pieces of `src`, and + `count` means the lengths of the copied pieces of `src`. Note that the copy process + will run asynchronously from cuda to pinned memory. We can simply remember this as "gpu async_write to pin_memory". - + Arguments: - - src (Tensor): The source tensor, and the data type should be `float32` currently. + + src (Tensor): The source tensor, and the data type should be `float32` currently. Besides, `src` should be placed on CUDAPlace. - dst (Tensor): The destination tensor, and the data type should be `float32` currently. - Besides, `dst` should be placed on CUDAPinnedPlace. The shape of `dst` - should be the same with `src` except for the first dimension. + dst (Tensor): The destination tensor, and the data type should be `float32` currently. + Besides, `dst` should be placed on CUDAPinnedPlace. The shape of `dst` + should be the same as `src` except for the first dimension. - offset (Tensor): The offset tensor, and the data type should be `int64` currently. - Besides, `offset` should be placed on CPUPlace. The shape of `offset` - should be one-dimensional. - - count (Tensor): The count tensor, and the data type should be `int64` currently. - Besides, `count` should be placed on CPUPlace. The shape of `count` - should be one-dimensinal. 
+ offset (Tensor): The offset tensor, and the data type should be `int64` currently. + Besides, `offset` should be placed on CPUPlace. The shape of `offset` + should be one-dimensional. + + count (Tensor): The count tensor, and the data type should be `int64` currently. + Besides, `count` should be placed on CPUPlace. The shape of `count` + should be one-dimensional. Examples: .. code-block:: python import numpy as np import paddle - from paddle.fluid import core + from paddle.fluid import core from paddle.device import cuda - + if core.is_compiled_with_cuda(): src = paddle.rand(shape=[100, 50, 50]) dst = paddle.emtpy(shape=[200, 50, 50]).pin_memory() @@ -3058,38 +3058,38 @@ void BindImperative(py::module *m_ptr) { stream); }, R"DOC( - This api provides a way to read from pieces of source tensor to destination tensor - asynchronously. In which, we use `index`, `offset` and `count` to determine where - to read. `index` means the index position of src tensor we want to read. `offset` - and count means the begin points and length of pieces of src tensor we want to read. - To be noted, the copy process will run asynchronously from pin memory to cuda place. + This api provides a way to read pieces of the source tensor to the destination tensor + asynchronously. Here, we use `index`, `offset` and `count` to determine where + to read. `index` means the index positions of the src tensor we want to read. `offset` + and `count` mean the begin points and lengths of the pieces of src we want to read. + Note that the copy process will run asynchronously from pinned memory to the cuda place. We can simply remember this as "cuda async_read from pin_memory". Arguments: - - src (Tensor): The source tensor, and the data type should be `float32` currently. + + src (Tensor): The source tensor, and the data type should be `float32` currently. Besides, `src` should be placed on CUDAPinnedPlace. - - dst (Tensor): The destination tensor, and the data type should be `float32` currently. - Besides, `dst` should be placed on CUDAPlace. The shape of `dst` should + + dst (Tensor): The destination tensor, and the data type should be `float32` currently. + Besides, `dst` should be placed on CUDAPlace. The shape of `dst` should be the same as `src` except for the first dimension. - index (Tensor): The index tensor, and the data type should be `int64` currently. - Besides, `index` should be on CPUplace. The shape of `index` should + index (Tensor): The index tensor, and the data type should be `int64` currently. + Besides, `index` should be on CPUPlace. The shape of `index` should be one-dimensional. - buffer (Tensor): The buffer tensor, used to buffer index copy tensor temporarily. - The data type should be `float32` currently, and should be placed + buffer (Tensor): The buffer tensor, used to buffer the index-copied tensor temporarily. + The data type should be `float32` currently, and should be placed on CUDAPinnedPlace. The shape of `buffer` should be the same with `src` except for the first dimension. - offset (Tensor): The offset tensor, and the data type should be `int64` currently. - Besides, `offset` should be placed on CPUPlace. The shape of `offset` + offset (Tensor): The offset tensor, and the data type should be `int64` currently. + Besides, `offset` should be placed on CPUPlace. The shape of `offset` should be one-dimensional. - count (Tensor): The count tensor, and the data type should be `int64` currently. - Besides, `count` should be placed on CPUPlace. 
The shape of `count` + count (Tensor): The count tensor, and the data type should be `int64` currently. + Besides, `count` should be placed on CPUPlace. The shape of `count` + should be one-dimensional. - + Examples: .. code-block:: python @@ -3108,11 +3108,11 @@ void BindImperative(py::module *m_ptr) { buffer = paddle.empty(shape=[50, 50, 50], dtype="float32").pin_memory() index = paddle.to_tensor( np.array([1, 3, 5, 7, 9], dtype="int64")).cpu() - + stream = cuda.Stream() with cuda.stream_guard(stream): core.async_read(src, dst, index, buffer, offset, count) - + )DOC"); #endif } diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 0b44dc5d2a2ca..02be0e9693ab7 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -296,9 +296,9 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT Default 100. .. note:: - 1. If you fetch data when calling the 'run', the ParallelExecutor - will clean up the temp variables at the end of the current iteration. - 2. In some NLP model, it may cause the GPU memory is insufficient, + 1. If you fetch data when calling the 'run', the ParallelExecutor + will clean up the temp variables at the end of the current iteration. + 2. In some NLP models, it may cause the GPU memory to be insufficient; in this case, you should reduce `num_iteration_per_drop_scope`. Examples: @@ -859,7 +859,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT synchronous batch normalization which synchronizes the mean and variance through multi-devices in training phase. Current implementation doesn't support FP16 training and CPU. - And only synchronous on one machine, not all machines. + And it is synchronous only within one machine, not across all machines. Default is False. Examples: @@ -897,9 +897,9 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT R"DOC((bool, optional): memory opitimize aims to save total memory consumption, set to True to enable it. - Default None. None means framework would choose to use or not use - this strategy automatically. Currently, None means that it is - enabled when GC is disabled, and disabled when GC is enabled. + Default None. None means the framework chooses whether to use + this strategy automatically. Currently, None means that it is + enabled when GC is disabled, and disabled when GC is enabled. True means enabling and False means disabling. Default is None. 
Examples: @@ -912,7 +912,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT build_strategy = static.BuildStrategy() build_strategy.memory_optimize = True - + )DOC") .def_property( "is_distribution", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0044f037fc017..b19cc07611bdb 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -205,6 +205,14 @@ PyTypeObject *g_framework_scope_pytype = nullptr; PyTypeObject *g_framework_lodtensorarray_pytype = nullptr; PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr; +bool IsCompiledWithAVX() { +#ifndef PADDLE_WITH_AVX + return false; +#else + return true; +#endif +} + bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) return false; @@ -576,12 +584,7 @@ static int GetNCCLVersion() { } #endif -#ifdef PADDLE_WITH_AVX -PYBIND11_MODULE(core_avx, m) { -#else -PYBIND11_MODULE(core_noavx, m) { -#endif - +PYBIND11_MODULE(libpaddle, m) { BindImperative(&m); BindEager(&m); BindEagerStringTensor(&m); @@ -1038,7 +1041,7 @@ All parameter, weight, gradient are variables in Paddle. py::arg("name"), R"DOC( Find variable named :code:`name` in the current scope or - its parent scope. Return None if not found. + its parent scope. Return None if not found. Args: name (str): the variable name. @@ -1053,7 +1056,7 @@ All parameter, weight, gradient are variables in Paddle. py::arg("names"), R"DOC( Find variable named :code:`name` in the current scope or - its parent scope. Return None if not found. + its parent scope. Return None if not found. Args: name (str): the variable names to be erase. @@ -1248,12 +1251,12 @@ All parameter, weight, gradient are variables in Paddle. R"DOC( Prune the backward part of a program, mostly called in program.clone(for_test=True). - + Args: program (ProgramDesc): The original program. Returns: - tuple(ProgramDesc, map): The first part is + tuple(ProgramDesc, map): The first part is the pruned program desc, and the second part is a map which contains the id pair of pruned block and corresponding origin block. @@ -1706,6 +1709,7 @@ All parameter, weight, gradient are variables in Paddle. }); m.def("init_default_kernel_signatures", []() { framework::InitDefaultKernelSignatureMap(); }); + m.def("is_compiled_with_avx", IsCompiledWithAVX); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); @@ -1873,7 +1877,7 @@ All parameter, weight, gradient are variables in Paddle. py::arg("tensor"), R"DOC( Append a LoDensor to LoDTensorArray. - + Args: tensor (LoDTensor): The LoDTensor to be appended. diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 8396a970bdd4f..8152a11c8193a 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -430,7 +430,7 @@ void BindTensor(pybind11::module &m) { // NOLINT py::arg("zero_copy") = false, R"DOC( Set the data of Tensor on place with given numpy array. - + Args: lod (numpy.ndarray): The data to set. place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the @@ -613,7 +613,7 @@ void BindTensor(pybind11::module &m) { // NOLINT Args: recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths. - + Returns: None. @@ -644,7 +644,7 @@ void BindTensor(pybind11::module &m) { // NOLINT Returns: list[list[int]]: The lod of the Tensor. - + Examples: .. 
code-block:: python @@ -668,7 +668,7 @@ void BindTensor(pybind11::module &m) { // NOLINT return new_lod; }, R"DOC( - Return the recursive sequence lengths corresponding to of the LodD + Return the recursive sequence lengths corresponding to the LoD of the Tensor. Returns: diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index b7ce9900b6f60..ec268a529adb3 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -3,6 +3,12 @@ extra : attrs : [bool use_cudnn = false, bool use_mkldnn = false] +- op : add (elementwise_add) + backward : add_grad (elementwise_add_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : acosh backward : acosh_grad extra : @@ -193,6 +199,12 @@ outputs : out : Out +- op : divide (elementwise_div) + backward : divide_grad (elementwise_div_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : dot inputs : {x : X, y : Y} @@ -209,6 +221,12 @@ extra : attrs : [bool fix_seed = false, int seed = 0] +- op : elementwise_pow + backward : elementwise_pow_grad + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : elu backward : elu_grad extra : @@ -248,6 +266,23 @@ inputs: {x: X} outputs: {out: Out} +- op : floor_divide (elementwise_floordiv) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : fmax (elementwise_fmax) + backward : fmax_grad (elementwise_fmax_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : fmin (elementwise_fmin) + backward : fmin_grad (elementwise_fmin_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : floor backward : floor_grad extra : @@ -263,6 +298,11 @@ extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_cudnn = false] +- op : grad_add + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : grid_sampler backward : grid_sampler_grad extra : @@ -273,6 +313,12 @@ extra : attrs : [bool is_test = false] +- op : heaviside (elementwise_heaviside) + backward : heaviside_grad (elementwise_heaviside_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : hard_swish backward : hard_swish_grad extra : @@ -346,6 
+392,24 @@ str mkldnn_data_type = "float32", 'int[] fused_reshape_X = {}', 'int[] fused_reshape_Y = {}', 'int[] fused_transpose_X = {}', 'int[] fused_transpose_Y = {}',] +- op : maximum (elementwise_max) + backward : maximum_grad (elementwise_max_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : minimum (elementwise_min) + backward : minimum_grad (elementwise_min_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : multiply (elementwise_mul) + backward : multiply_grad (elementwise_mul_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : mish backward : mish_grad extra : @@ -436,6 +500,11 @@ extra : attrs : [bool use_mkldnn = false] +- op : remainder (elementwise_mod) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : relu backward : relu_grad extra : @@ -546,6 +615,12 @@ extra : attrs : [bool use_mkldnn = false] +- op : subtract (elementwise_sub) + backward : subtract_grad (elementwise_sub_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + - op : swish backward : swish_grad extra : diff --git a/paddle/phi/backends/device_base.h b/paddle/phi/backends/device_base.h index a0d95349196e0..7030777474d5a 100644 --- a/paddle/phi/backends/device_base.h +++ b/paddle/phi/backends/device_base.h @@ -96,9 +96,10 @@ class DeviceInterface { // Driver / Runtime // Event // ! Create an event. - virtual void CreateEvent(size_t dev_id, - event::Event* event, - event::Event::Flag flags); + virtual void CreateEvent( + size_t dev_id, + event::Event* event, + event::Event::Flag flags = event::Event::Flag::Default); // ! Destroy an event. virtual void DestroyEvent(size_t dev_id, event::Event* event); diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 54bafd796df46..130f8fab449ac 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -55,7 +55,8 @@ class Device final { // Event // ! Create an event. - void CreateEvent(event::Event* event, event::Event::Flag flags); + void CreateEvent(event::Event* event, + event::Event::Flag flags = event::Event::Flag::Default); // ! Destroy an event. void DestroyEvent(event::Event* event); diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 8da2623bb2c2d..251916d8c1a15 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -425,7 +425,7 @@ struct EnforceNotMet : public std::exception { * __ROLE: (string), Input or Output * __NAME: (string), Input or Output name * __OP_TYPE: (string), the op type - *   + * * Return: The data pointed to by the pointer. 
* * Examples: diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 40dfb76586189..952ad90fc0686 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -213,9 +213,10 @@ struct DimensionsTransform { } } }; - int swap_idx = 0; - bool has_seq_one = FindSequentialOneDim(&swap_idx); - if (has_seq_one) { + for (auto i = 0; i < dim_size; ++i) { + int swap_idx = 0; + bool has_seq_one = FindSequentialOneDim(&swap_idx); + if (!has_seq_one) break; merge_ptr = merge_sequential_one_dims; MergeDimensions(merge_ptr, N); std::swap(in_dims[swap_idx], in_dims[0]); @@ -508,7 +509,6 @@ void BroadcastKernelForDifferentVecSize( "functions is %d.", outs->size(), NumOuts)); - // mergedim and get vec_size const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); phi::Array configs; diff --git a/paddle/phi/kernels/funcs/data_layout_transform.cc b/paddle/phi/kernels/funcs/data_layout_transform.cc index 9b09d897eff88..9d2d0bf3b5c88 100644 --- a/paddle/phi/kernels/funcs/data_layout_transform.cc +++ b/paddle/phi/kernels/funcs/data_layout_transform.cc @@ -83,7 +83,9 @@ void innerTransDataLayoutFromOneDNN(DataLayout in_layout, out->set_mem_desc(out_mem_desc); out->Resize(in.dims()); - if ((in.mem_desc() != out->mem_desc()) || always_copy) { + // Note(0x45f): Using initialized() to support slice Tensors + // with shapes like [0, 0, 0]. + if (in.initialized() && ((in.mem_desc() != out->mem_desc()) || always_copy)) { void* in_data = GetDataFromTensor(in, in_type); ReorderOneDNNHandler handler(in_tz, in.dtype(), in_type, cpu_engine); diff --git a/paddle/phi/kernels/gpu/cast_impl.h b/paddle/phi/kernels/gpu/cast_impl.h index f73d396572541..c209705a7418f 100644 --- a/paddle/phi/kernels/gpu/cast_impl.h +++ b/paddle/phi/kernels/gpu/cast_impl.h @@ -20,7 +20,7 @@ namespace phi { template -struct CastFuctor { +struct CastFunctor { __device__ __forceinline__ OutT operator()(const InT x) const { return static_cast(x); } @@ -36,7 +36,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx, outputs.emplace_back(out); dev_ctx.Alloc(out); phi::funcs::ElementwiseKernel( - dev_ctx, inputs, &outputs, CastFuctor()); + dev_ctx, inputs, &outputs, CastFunctor()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index f11fd0191b935..6694216214c31 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -23,6 +23,8 @@ #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/embedding_util.h" +DECLARE_bool(cudnn_deterministic); + namespace phi { template @@ -101,6 +103,11 @@ struct EmbeddingGradCUDAFunctor { const int gridx = 2 * dev_ctx_.GetSMCount(); dim3 threads(128, 8); dim3 grids(gridx, 1); + + if (FLAGS_cudnn_deterministic) { + VLOG(2) << "Run grad kernel of embedding with single thread."; + grids.x = 1; + } EmbeddingGrad<<>>( d_table, d_output, ids, N, K, D); } diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index d982971029091..6bcc3d6ff4e29 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -13,126 +13,123 @@ // limitations under the License. 
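The rewrite in the hunk below drops the old launch path, which materialized shape, stride, and flip-axis arrays in device memory (three Alloc + Copy round trips per call), in favor of fixed-size phi::Array values passed to the kernel by value, with the kernel templated on the tensor rank. A reduced sketch of that pattern (FlipSketch and its parameters are illustrative, not code from this diff):

    // Because Rank is a compile-time bound, the metadata has fixed size and
    // travels in the kernel argument buffer; no device buffers are needed.
    template <typename T, size_t Rank>
    __global__ void FlipSketch(int64_t numel, const T* in, T* out,
                               phi::Array<int64_t, Rank> stride) {
      int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx >= numel) return;
      // The real kernel decomposes idx via `stride`, mirrors indices along
      // the flipped axes, and gathers; an identity copy stands in for that.
      out[idx] = in[idx];
    }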
#include "paddle/phi/kernels/flip_kernel.h" - #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/array.h" namespace phi { -template +template __global__ void flip_cuda_kernel(const int N, const T* in_data, T* out_data, - int64_t* x_shape, - int64_t* x_stride, - int* flip_dims, - int flip_dims_size, - int total_dims) { + phi::Array shape, + phi::Array stride, + phi::Array flip_dims, + int flip_dims_size) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) { return; } int cur_indices = idx, rem = 0, dst_offset = 0; - for (int i = 0; i < total_dims; ++i) { + for (int i = 0; i < Rank; ++i) { int64_t temp = cur_indices; - cur_indices = cur_indices / x_stride[i]; - rem = temp - cur_indices * x_stride[i]; + cur_indices = cur_indices / stride[i]; + rem = temp - cur_indices * stride[i]; // flip the indices if it is in flip_dims for (int j = 0; j < flip_dims_size; ++j) { if (i == flip_dims[j]) { - cur_indices = x_shape[i] - 1 - cur_indices; + cur_indices = shape[i] - 1 - cur_indices; } } - dst_offset += cur_indices * x_stride[i]; + dst_offset += cur_indices * stride[i]; cur_indices = rem; } out_data[idx] = in_data[dst_offset]; } -template -void FlipKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DenseTensor* out) { - const auto gplace = dev_ctx.GetPlace(); - auto cplace = phi::CPUPlace(); - std::vector flip_dims = axis; - +template +void launch_flip_cuda_kernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + std::vector flip_dims_v = axis; auto* in_data = x.data(); auto* out_data = dev_ctx.template Alloc(out); - const int flip_dims_size = static_cast(flip_dims.size()); auto x_dims = x.dims(); const int total_dims = x_dims.size(); - const int N = x.numel(); + const int numel = x.numel(); int block_size = 512; dim3 dim_block(block_size); - dim3 dim_grid((N + block_size - 1) / block_size); + dim3 dim_grid((numel + block_size - 1) / block_size); - for (size_t i = 0; i < flip_dims.size(); ++i) { - if (flip_dims[i] < 0) { - flip_dims[i] += total_dims; + for (size_t i = 0; i < flip_dims_v.size(); ++i) { + if (flip_dims_v[i] < 0) { + flip_dims_v[i] += total_dims; } } auto x_stride = phi::stride(x_dims); - std::vector x_dims_v = phi::vectorize(x_dims); - std::vector x_stride_v = phi::vectorize(x_stride); - - int bytes = total_dims * sizeof(int64_t); - auto x_strides_array_tmp = paddle::memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int64_t* x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); - paddle::memory::Copy(gplace, - x_strides_array_gpu, - cplace, - x_stride_v.data(), - bytes, - dev_ctx.stream()); - - auto x_shape_array_tmp = paddle::memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int64_t* x_shape_array_gpu = - reinterpret_cast(x_shape_array_tmp->ptr()); - paddle::memory::Copy(gplace, - x_shape_array_gpu, - cplace, - x_dims_v.data(), - bytes, - dev_ctx.stream()); - bytes = flip_dims_size * sizeof(int); - auto flip_dims_array_tmp = paddle::memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* flip_dims_array_gpu = reinterpret_cast(flip_dims_array_tmp->ptr()); - paddle::memory::Copy(gplace, - flip_dims_array_gpu, - cplace, - flip_dims.data(), - 
bytes, - dev_ctx.stream()); + phi::Array stride_a; + phi::Array shape_a; + phi::Array flip_dims_a; + size_t flip_dims_size = flip_dims_v.size(); + for (size_t idx = 0; idx < N; ++idx) { + stride_a[idx] = x_stride[idx]; + shape_a[idx] = x_dims[idx]; + flip_dims_a[idx] = idx < flip_dims_size ? flip_dims_v[idx] : 0; + } + flip_cuda_kernel<<>>( + numel, in_data, out_data, shape_a, stride_a, flip_dims_a, flip_dims_size); +} - flip_cuda_kernel - <<>>(N, - in_data, - out_data, - x_shape_array_gpu, - x_strides_array_gpu, - flip_dims_array_gpu, - flip_dims_size, - total_dims); +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + const size_t total_dims = x.dims().size(); + switch (total_dims) { + case 1: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + case 2: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + case 3: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + case 4: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + case 5: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + case 6: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + case 7: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + case 8: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + case 9: + launch_flip_cuda_kernel(dev_ctx, x, axis, out); + break; + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "dims of input tensor should be less than 10, but received %d", + x.dims().size())); + } } } // namespace phi diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 684e9156609ad..120e908ae8cf7 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -20,11 +20,11 @@ limitations under the License. */ namespace phi { template -struct FullFuctor { +struct FullFunctor { OutT value; template - explicit inline FullFuctor(VType val) { + explicit inline FullFunctor(VType val) { value = static_cast(val); } @@ -50,7 +50,7 @@ void FullKernel(const Context& dev_ctx, // the data will not be loaded in the kernel because the number of // parameters in the operator is 0 phi::funcs::ElementwiseKernel( - dev_ctx, inputs, &outputs, FullFuctor(val.to())); + dev_ctx, inputs, &outputs, FullFunctor(val.to())); } } @@ -104,7 +104,7 @@ void FullLikeKernel(const Context& dev_ctx, int numel = out->numel(); if (numel > 0) { phi::funcs::ElementwiseKernel( - dev_ctx, inputs, &outputs, FullFuctor(value)); + dev_ctx, inputs, &outputs, FullFunctor(value)); } } diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index 9783687ba5fb7..ff8fb1702075c 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -20,6 +20,8 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/utils/data_type.h" +DECLARE_bool(cudnn_deterministic); + namespace phi { using paddle::platform::PADDLE_CUDA_NUM_THREADS; @@ -79,6 +81,12 @@ void IndexAddKernel(const Context& ctx, // todo(@limin29): inplace do not need copy.
phi::Copy(ctx, x, ctx.GetPlace(), false, output); + if (FLAGS_cudnn_deterministic) { + VLOG(2) << "Run kernel of index_add with single thread."; + block_dim = 1; + grid_dim.x = 1; + } + if (index_type == phi::DataType::INT64) { const int64_t* index_data = index.data(); index_add_cuda_kernel diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 3f6148c7efd80..d6139501b4e3c 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -85,7 +85,7 @@ struct FastDivMod { struct BroadcastConfig { FastDivMod divmoders[phi::DDim::kMaxRank]; uint32_t strides[phi::DDim::kMaxRank]; - int kDims; + int kDims{0}; HOSTDEVICE BroadcastConfig() {} HOSTDEVICE BroadcastConfig(const std::vector& out_dims, diff --git a/paddle/phi/kernels/sparse/gpu/full_kernel.cu b/paddle/phi/kernels/sparse/gpu/full_kernel.cu index d5ccfc95c6a00..223561bc17913 100644 --- a/paddle/phi/kernels/sparse/gpu/full_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/full_kernel.cu @@ -23,11 +23,11 @@ limitations under the License. */ namespace phi { template -struct FullFuctor { +struct FullFunctor { OutT value; template - explicit inline FullFuctor(VType val) { + explicit inline FullFunctor(VType val) { value = static_cast(val); } @@ -54,7 +54,7 @@ void CooFullLikeKernel(const Context& dev_ctx, int numel = values->numel(); if (numel > 0) { phi::funcs::ElementwiseKernel( - dev_ctx, inputs, &outputs, FullFuctor(val.to())); + dev_ctx, inputs, &outputs, FullFunctor(val.to())); } out->set_dims(x.dims()); } @@ -80,7 +80,7 @@ void CsrFullLikeKernel(const Context& dev_ctx, int numel = values->numel(); if (numel > 0) { phi::funcs::ElementwiseKernel( - dev_ctx, inputs, &outputs, FullFuctor(val.to())); + dev_ctx, inputs, &outputs, FullFunctor(val.to())); } out->set_dims(x.dims()); } diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 25a986ea82fb0..be232b7c671e9 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -121,8 +121,10 @@ void TransferLayoutMKLDNN(const Context& dev_ctx, OneDNNContext::tls().set_cur_paddle_data_layout(src_layout); } - out->set_layout(DataLayout::ONEDNN); - out->set_format(out_format); + dnnl::memory::desc out_mem_desc(vectorize(out->dims()), + funcs::ToOneDNNDataType(x.dtype()), + out_format); + out->set_mem_desc(out_mem_desc); } else if (src_layout == DataLayout::ONEDNN && dst_layout != DataLayout::ONEDNN) { // Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ddb888dfaa01f..b94da64f4559c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -272,7 +272,6 @@ EOF -DWITH_DISTRIBUTE=${distibuted_flag} \ -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ - -DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ -DNEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF} \ -DNEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF} \ @@ -546,23 +545,26 @@ EOF } -function combine_avx_noavx_build() { - mkdir -p ${PADDLE_ROOT}/build.noavx - cd ${PADDLE_ROOT}/build.noavx - WITH_AVX=OFF +function avx_build() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + WITH_AVX=ON + cmake_base ${PYTHON_ABI:-""} build_base +} - # build combined one + +function noavx_build() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build - NOAVX_CORE_FILE=`find
${PADDLE_ROOT}/build.noavx/python/paddle/fluid/ -name "core_noavx.*"` - WITH_AVX=ON + WITH_AVX=OFF cmake_base ${PYTHON_ABI:-""} build_base } + function mac_m1_arm_build() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -989,7 +991,7 @@ function generate_upstream_develop_api_spec() { mkdir -p ${PADDLE_ROOT}/build/pr_whl && mv ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl/ echo "pr_whl_size: ${pr_whl_size}" - rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt + rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt ${PADDLE_ROOT}/build/python/paddle cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` cd ${PADDLE_ROOT} @@ -2887,12 +2889,12 @@ EOF local LIB_TYPE=$1 case $LIB_TYPE in full) - # Build full Paddle Python module. Will timeout without caching 'copy_paddle_pybind' first - make -j `nproc` framework_py_proto copy_paddle_pybind paddle_python + # Build full Paddle Python module. Will timeout without caching 'copy_libpaddle' first + make -j `nproc` framework_py_proto copy_libpaddle paddle_python ;; pybind) # Build paddle pybind library. Takes 49 minutes to build. Might timeout - make -j `nproc` copy_paddle_pybind + make -j `nproc` copy_libpaddle ;; proto) # Even smaller library. @@ -3365,7 +3367,7 @@ function build_pr_and_develop() { mkdir ${PADDLE_ROOT}/build/pr_whl && cp ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl rm -f ${PADDLE_ROOT}/build/python/dist/*.whl && rm -f ${PADDLE_ROOT}/build/python/build/.timestamp if [[ ${cmake_change} ]];then - rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt + rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt ${PADDLE_ROOT}/build/python/paddle rm -rf ${PADDLE_ROOT}/build/third_party fi @@ -3485,16 +3487,25 @@ function main() { gen_dockerfile ${PYTHON_ABI:-""} assert_api_spec_approvals ;; - combine_avx_noavx) - combine_avx_noavx_build + avx_build) + avx_build + gen_dockerfile ${PYTHON_ABI:-""} + ;; + noavx_build) + noavx_build gen_dockerfile ${PYTHON_ABI:-""} ;; mac_m1_arm) mac_m1_arm_build gen_dockerfile ${PYTHON_ABI:-""} ;; - combine_avx_noavx_build_and_test) - combine_avx_noavx_build + avx_build_and_test) + avx_build + gen_dockerfile ${PYTHON_ABI:-""} + parallel_test_base + ;; + noavx_build_and_test) + noavx_build gen_dockerfile ${PYTHON_ABI:-""} parallel_test_base ;; diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat index 9a2ed349e5b92..0aeacfef7f9bd 100644 --- a/paddle/scripts/windows_build/build.bat +++ b/paddle/scripts/windows_build/build.bat @@ -82,8 +82,8 @@ echo Current directory : %cd% call:rest_env -echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd -cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All 
-DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd +echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All +cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All set MSBUILDDISABLENODEREUSE=1 @@ -184,4 +184,4 @@ exit /b 1 :END echo BUILD SUCCESSFULLY -ENDLOCAL \ No newline at end of file +ENDLOCAL diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index e33af8b1bd52e..b935fb78f4e90 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -20,28 +20,7 @@ endif() set(SETUP_LOG_FILE "setup.py.log") -set(FLUID_CORE_NAME "core") -if(WITH_AVX AND AVX_FOUND) - set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_avx") - if(NOT DEFINED NOAVX_CORE_FILE OR NOAVX_CORE_FILE STREQUAL "") - message( - STATUS - "MESSAGE: This is just a message for publishing release. - You are building AVX version without NOAVX core. - So the wheel package may fail on NOAVX machine. - You can add -DNOAVX_CORE_FILE=/path/to/your/core_noavx.* in cmake command - to get a full wheel package to resolve this warning. - While, this version will still work on local machine.") - endif() - - if(NOAVX_CORE_FILE AND NOT EXISTS "${NOAVX_CORE_FILE}") - message(FATAL_ERROR "The file ${NOAVX_CORE_FILE} does not exist!") - endif() - - set(HAS_NOAVX_CORE ON) -else() - set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_noavx") -endif() +set(FLUID_CORE_NAME "libpaddle") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) @@ -55,48 +34,20 @@ if(WIN32) add_custom_command( OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE_LIB} - DEPENDS paddle_pybind) - - set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd) + COMMAND cmake -E copy $ ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE_LIB} + DEPENDS libpaddle) else() set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.so) add_custom_command( OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) - - set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.so) + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS libpaddle) endif() set(FLUID_CORE_DEPS ${FLUID_CORE}) -if(HAS_NOAVX_CORE AND EXISTS "${NOAVX_CORE_FILE}") - get_filename_component(NOAVX_CORE_NAME ${NOAVX_CORE_FILE} NAME) - get_filename_component(NOAVX_CORE_EXT ${NOAVX_CORE_FILE} EXT) - if(WIN32) - if(NOT NOAVX_CORE_EXT STREQUAL ".pyd") - message( - FATAL_ERROR - "Wrong file ${NOAVX_CORE_NAME}, the ext does not match windows *.pyd!" 
- ) - endif() - else() - if(NOT NOAVX_CORE_EXT STREQUAL ".so") - message( - FATAL_ERROR - "Wrong file ${NOAVX_CORE_NAME}, the ext does not match *.so!") - endif() - endif() - add_custom_command( - OUTPUT ${FLUID_NOAVX_CORE} - COMMAND cmake -E copy ${NOAVX_CORE_FILE} ${FLUID_NOAVX_CORE} - DEPENDS paddle_pybind) - list(APPEND FLUID_CORE_DEPS ${FLUID_NOAVX_CORE}) -endif() - -add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE_DEPS}) +add_custom_target(copy_libpaddle ALL DEPENDS ${FLUID_CORE_DEPS}) if(WIN32) add_custom_command( @@ -107,8 +58,8 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMENT "Packing whl packages------>>>" - DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto - profiler_py_proto pass_desc_py_proto ${PY_FILES}) + DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto + pass_desc_py_proto ${PY_FILES}) else() add_custom_command( OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp @@ -116,8 +67,8 @@ else() COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMENT "Packing whl packages------>>>" - DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto - profiler_py_proto pass_desc_py_proto ${PY_FILES}) + DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto + pass_desc_py_proto ${PY_FILES}) endif() add_custom_target(paddle_python ALL diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 8bc7b11368680..70fc9647cd489 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -26,9 +26,11 @@ from .py_layer import LegacyPyLayerContext as PyLayerContext # noqa: F401 from ..framework import set_grad_enabled, is_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 +from .saved_tensors_hooks import saved_tensors_hooks __all__ = [ # noqa 'backward', 'PyLayer', 'PyLayerContext', + 'saved_tensors_hooks', ] diff --git a/python/paddle/autograd/saved_tensors_hooks.py b/python/paddle/autograd/saved_tensors_hooks.py new file mode 100644 index 0000000000000..1e0f292d68b49 --- /dev/null +++ b/python/paddle/autograd/saved_tensors_hooks.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid import core + +__all__ = [] + + +class saved_tensors_hooks(): + """ + Dynamic graph only. Registers a pair of pack / unpack hooks for saved tensors. + + Parameters: + pack_hook (function): The pack hook will be called every time a forward + tensor needs to be saved for backward. You can then save the tensor to + CPU or disk. The input of `pack_hook` is the tensor to be saved. The + output of `pack_hook` is the information stored in place of the original + tensor. `pack_hook` will also be called whenever a tensor is saved by + `PyLayerContext.save_for_backward`.
`pack_hook` will not be + called for a tensor saved for backward that does not need its buffer, + and it is only called when the tensor saved for backward is a LoDTensor. + unpack_hook (function): The unpack hook will be called every time the + backward pass needs to use the saved tensors. You can then reload + the tensor and return it to the Paddle framework. The input of `unpack_hook` + is the information returned by `pack_hook`. The output of `unpack_hook` + is a tensor reloaded from that information, and the tensor must have the same + content as the original tensor passed as input to the corresponding + `pack_hook`. + + Returns: + None + + Examples: + .. code-block:: python + + # Example1 + import paddle + + def pack_hook(x): + print("Packing", x) + return x.numpy() + + def unpack_hook(x): + print("UnPacking", x) + return paddle.to_tensor(x) + + a = paddle.ones([3,3]) + b = paddle.ones([3,3]) * 2 + a.stop_gradient = False + b.stop_gradient = False + with paddle.autograd.saved_tensors_hooks(pack_hook, unpack_hook): + y = paddle.multiply(a, b) + y.sum().backward() + + # Example2 + import paddle + from paddle.autograd import PyLayer + + class cus_multiply(PyLayer): + @staticmethod + def forward(ctx, a, b): + y = paddle.multiply(a, b) + ctx.save_for_backward(a, b) + return y + + @staticmethod + def backward(ctx, dy): + a,b = ctx.saved_tensor() + grad_a = dy * a + grad_b = dy * b + return grad_a, grad_b + + def pack_hook(x): + print("Packing", x) + return x.numpy() + + def unpack_hook(x): + print("UnPacking", x) + return paddle.to_tensor(x) + + a = paddle.ones([3,3]) + b = paddle.ones([3,3]) * 2 + a.stop_gradient = False + b.stop_gradient = False + with paddle.autograd.saved_tensors_hooks(pack_hook, unpack_hook): + y = cus_multiply.apply(a, b) + y.sum().backward() + """ + + def __init__(self, pack_hook, unpack_hook): + self.pack_hook = pack_hook + self.unpack_hook = unpack_hook + + def __enter__(self): + core.eager.register_saved_tensors_hooks(self.pack_hook, + self.unpack_hook) + + def __exit__(self, *args): + core.eager.reset_saved_tensors_hooks() diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 4dc68edfe2d55..269a0ec644dbd 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .interface import shard_tensor # noqa: F401 -from .interface import shard_op # noqa: F401 +from .strategy import Strategy from .process_mesh import ProcessMesh -from .reshard import Resharder # noqa: F401 -from .cost_model import estimate_cost +from .engine import Engine +from .interface import shard_tensor +from .interface import shard_op +from .interface import recompute +from .interface import fetch __all__ = [] diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py new file mode 100644 index 0000000000000..82b3d4554b76a --- /dev/null +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +from collections import defaultdict + +# _g_default_config[category][field] = default_value +_g_default_config = defaultdict(dict) + + +def get_category_default_config(category): + return _g_default_config[category] + + +def set_category_default_config(category, default_value): + _g_default_config[category] = default_value + + +def get_field_default_config(category, field): + return _g_default_config[category][field] + + +def set_field_default_config(category, field, default_value): + _g_default_config[category][field] = default_value + + +NOT_FOUND = "not_found" + +######################################### +# base configuration +######################################### +BASE = "base" +set_field_default_config(BASE, "auto_mode", "semi") +set_field_default_config(BASE, "gradient_scale", True) +set_field_default_config(BASE, "use_cache", True) +set_field_default_config(BASE, "return_numpy", True) +set_field_default_config(BASE, "all_ranks", False) +set_field_default_config(BASE, "split_data", False) +set_field_default_config(BASE, "seed", None) +set_field_default_config(BASE, "reinit", False) # Only for debug + +######################################### +# recompute configuration +######################################### +RECOMPUTE = "recompute" +set_field_default_config(RECOMPUTE, "enable", False) +set_field_default_config(RECOMPUTE, "checkpoints", None) +set_field_default_config(RECOMPUTE, "enable_tuning", False) + +######################################### +# AMP configuration +######################################### +AMP = "amp" +set_field_default_config(AMP, "enable", False) +set_field_default_config(AMP, "init_loss_scaling", 32768.0) +set_field_default_config(AMP, "incr_every_n_steps", 1000) +set_field_default_config(AMP, "decr_every_n_nan_or_inf", 2) +set_field_default_config(AMP, "incr_ratio", 2.0) +set_field_default_config(AMP, "decr_ratio", 0.8) +set_field_default_config(AMP, "use_dynamic_loss_scaling", True) +set_field_default_config(AMP, "custom_white_list", []) +set_field_default_config(AMP, "custom_black_list", []) +set_field_default_config(AMP, "custom_black_varnames", []) +set_field_default_config(AMP, "use_pure_fp16", False) +set_field_default_config(AMP, "use_fp16_guard", True) +set_field_default_config(AMP, "use_optimizer_fp16", False) + +######################################### +# sharding configuration +######################################### +SHARDING = "sharding" +set_field_default_config(SHARDING, "enable", False) +set_field_default_config(SHARDING, "stage", 1) +set_field_default_config(SHARDING, "sharding_degree", 8) +set_field_default_config(SHARDING, "segment_broadcast_MB", 32.0) +set_field_default_config(SHARDING, "enable_tuning", False) +set_field_default_config(SHARDING, "tuning_range", []) + +######################################### +# gradient merge configuration +######################################### +GRADIENT_MERGE = "gradient_merge" +set_field_default_config(GRADIENT_MERGE, "enable", False) +set_field_default_config(GRADIENT_MERGE, "k_steps", 1) +set_field_default_config(GRADIENT_MERGE, "avg", True) + 
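+# A minimal usage sketch (hypothetical caller, not part of this file): the
+# helpers above keep defaults in a two-level {category: {field: default}}
+# mapping, so other modules can query them like this:
+#
+#   import paddle.distributed.auto_parallel.constants as constants
+#   constants.get_field_default_config(constants.AMP, "init_loss_scaling")
+#   # -> 32768.0
+#   constants.get_category_default_config(constants.RECOMPUTE)
+#   # -> {"enable": False, "checkpoints": None, "enable_tuning": False}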
+######################################### +# quantization configuration +######################################### +QAT = "qat" +set_field_default_config(QAT, "enable", False) +set_field_default_config(QAT, "channel_wise_abs_max", True) +set_field_default_config(QAT, "weight_bits", 8) +set_field_default_config(QAT, "activation_bits", 8) +set_field_default_config(QAT, "not_quant_pattern", ['skip_quant']) +set_field_default_config(QAT, "algo", None) + +# ######################################### +# auto tuning configuration +# ######################################### +TUNING = "tuning" +set_field_default_config(TUNING, "enable", False) +set_field_default_config(TUNING, "batch_size", 1) +set_field_default_config(TUNING, "dataset", None) +set_field_default_config(TUNING, "profile_start_step", 1) +set_field_default_config(TUNING, "profile_end_step", 1) +set_field_default_config(TUNING, "run_after_tuning", True) +set_field_default_config(TUNING, "verbose", True) diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index ff07deb42aad3..92d0304eaf613 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -173,6 +173,17 @@ def mark_annotated_as(self, dist_attr): def clear_annotated(self): self._is_annotated.clear() + def __eq__(self, other): + if not isinstance(other, TensorDistributedAttribute): + return False + if self.process_mesh != other.process_mesh: + return False + if self.dims_mapping != other.dims_mapping: + return False + if self._is_annotated != other._is_annotated: + return False + return True + def __str__(self): str = "\n\ttensor_dist_attr = {" if self.is_annotated("process_mesh"): @@ -486,6 +497,27 @@ def is_annotated_output_dims_mapping(self, name): else: return False + def __eq__(self, other): + if not isinstance(other, OperatorDistributedAttribute): + return False + if self.process_mesh != other.process_mesh: + return False + if self.op_type != other.op_type: + return False + if self.impl_type != other.impl_type: + return False + if self.impl_idx != other.impl_idx: + return False + if self._is_annotated != other._is_annotated: + return False + if self._is_recompute != other._is_recompute: + return False + if self.inputs_dist_attrs != other.inputs_dist_attrs: + return False + if self.outputs_dist_attrs != other.outputs_dist_attrs: + return False + return True + def __str__(self): str = "\n\top_dist_attr = {" if self.is_annotated("process_mesh"): diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 92a503659041e..d1f00e8a7ba4f 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -126,9 +126,6 @@ def __init__(self, # A flag indicates whether the used parallelism is data parallel self._data_parallel = False - # flag whether using `to_static` - self._dygraph_mode = False - @property def serial_main_program(self): return self._serial_main_program diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index b6a77b778885f..300c80ec71878 100644 --- a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -23,6 +23,7 @@ from .dist_attribute import append_op_output_suffix from .dist_attribute import get_tensor_dist_attr_field_keys from .dist_attribute 
import get_op_dist_attr_field_keys +from .utils import convert_to_shard_spec, verify_shard_spec class DistributedOperator: @@ -248,23 +249,106 @@ def __deepcopy__(self, memo): return result -class DistributedModule: +class DistributedOperatorHelper: - def __init__(self, serial_module, dist_attr=None): - self._serial_module = serial_module - self._dist_attr = dist_attr + def __init__(self, serial_op, process_mesh, in_dims_mappings, + out_dims_mappings): + self._serial_op = serial_op + self._process_mesh = process_mesh + self._in_dims_mappings = in_dims_mappings + self._out_dims_mappings = out_dims_mappings def __call__(self, *args, **kwargs): - from .dist_context import get_default_distributed_context + tensor_to_dims_mapping = {} + index = 0 + if self._in_dims_mappings: + assert len(args) + len(kwargs) == len(self._in_dims_mappings), \ + "The length of dims_mapping {} does not match the number of inputs {}.".format(len(self._in_dims_mappings), len(args) + len(kwargs)) + for arg in args: + if isinstance(arg, Variable) and self._in_dims_mappings: + tensor_to_dims_mapping[arg.name] = self._in_dims_mappings[index] + index += 1 + for arg in kwargs.values(): + if isinstance(arg, Variable) and self._in_dims_mappings: + tensor_to_dims_mapping[arg.name] = self._in_dims_mappings[index] + index += 1 + default_prog = paddle.fluid.default_main_program() cur_block = default_prog.current_block() op_size = len(cur_block.ops) - output = self._serial_module(*args, **kwargs) + output = self._serial_op(*args, **kwargs) new_op_size = len(cur_block.ops) + + if isinstance(output, tuple) or isinstance(output, list): + new_output = list(output) + elif isinstance(output, Variable): + new_output = [output] + else: + raise ValueError("Unrecognized output.") + + if self._out_dims_mappings: + assert len(new_output) == len(self._out_dims_mappings), \ + "The length of dims_mapping {} does not match the number of outputs {}.".format(len(self._out_dims_mappings), len(new_output)) + for i, item in enumerate(new_output): + if isinstance(item, Variable) and self._out_dims_mappings: + tensor_to_dims_mapping[item.name] = self._out_dims_mappings[i] + + from .dist_context import get_default_distributed_context default_dist_ctx = get_default_distributed_context() for idx in range(op_size, new_op_size): op = cur_block.ops[idx] - dist_op = DistributedOperator(op, self._dist_attr) - dist_op.dist_attr.mark_annotated_as(self._dist_attr) + dist_op = DistributedOperator(op) + for name in dist_op.serial_op.input_arg_names: + if name in tensor_to_dims_mapping.keys(): + tensor = dist_op.get_serial_input(name) + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr( + name) + dims_mapping = tensor_to_dims_mapping[name] + if tensor is None: + tensor_shape = [] + else: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor.type == core.VarDesc.VarType.STEP_SCOPES: + tensor_shape = [] + else: + tensor_shape = tensor.shape + if dims_mapping is not None: + dims_mapping = tensor_to_dims_mapping[name] + shard_spec = convert_to_shard_spec( + dims_mapping, self._process_mesh) + assert verify_shard_spec(shard_spec, tensor_shape, self._process_mesh), \ + "For tensor {}, shard_spec {} is invalid with tensor_shape {} and process_mesh {}.".format( + name, shard_spec, tensor_shape, self._process_mesh) + tensor_dist_attr.dims_mapping = dims_mapping + tensor_dist_attr.mark_annotated("dims_mapping") + for name in dist_op.serial_op.output_arg_names: + if name in
tensor_to_dims_mapping.keys(): + tensor = dist_op.get_serial_output(name) + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr( + name) + dims_mapping = tensor_to_dims_mapping[name] + if tensor is None: + tensor_shape = [] + else: + if tensor.type == core.VarDesc.VarType.READER \ + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or tensor.type == core.VarDesc.VarType.STEP_SCOPES: + tensor_shape = [] + else: + tensor_shape = tensor.shape + if dims_mapping is not None: + dims_mapping = tensor_to_dims_mapping[name] + shard_spec = convert_to_shard_spec( + dims_mapping, self._process_mesh) + assert verify_shard_spec(shard_spec, tensor_shape, self._process_mesh), \ + "For tensor {}, shard_spec {} is invalid with tensor_shape {} and process_mesh {}.".format( + name, shard_spec, tensor_shape, self._process_mesh) + tensor_dist_attr.dims_mapping = dims_mapping + tensor_dist_attr.mark_annotated("dims_mapping") + dist_op.dist_attr.process_mesh = self._process_mesh + if self._process_mesh is not None: + dist_op.dist_attr.mark_annotated("process_mesh") default_dist_ctx.add_dist_op_for_program(dist_op) + return output diff --git a/python/paddle/distributed/auto_parallel/dist_saver.py b/python/paddle/distributed/auto_parallel/dist_saver.py index 265a746fc8c2b..350e5ac44e724 100644 --- a/python/paddle/distributed/auto_parallel/dist_saver.py +++ b/python/paddle/distributed/auto_parallel/dist_saver.py @@ -59,6 +59,14 @@ def __init__(self): def save(self, path, serial_program, dist_main_program, dist_context): + def _save_state(program, path, mode="param"): + state = { + k: np.array(v) + for k, v in program.state_dict(mode).items() + } + with open(path, "wb") as f: + pickle.dump(state, f) + dirname, filename = _process_path(path) rank_id = paddle.distributed.get_rank() @@ -76,16 +84,6 @@ def save(self, path, serial_program, dist_main_program, dist_context): with open(dist_model_path, "wb") as f: f.write(dist_main_program.desc.serialize_to_string()) - # save distributed params - dist_param_filename = filename + "_dist" + str(rank_id) + ".pdparams" - dist_param_path = os.path.join(dirname, dist_param_filename) - dist_param = { - k: np.array(v) - for k, v in dist_main_program.state_dict().items() - } - with open(dist_param_path, "wb") as f: - pickle.dump(dist_param, f) - # save distributed attribute dist_attr_filename = filename + "_dist" + str(rank_id) + ".pdattr" dist_attr_path = os.path.join(dirname, dist_attr_filename) @@ -93,65 +91,69 @@ def save(self, path, serial_program, dist_main_program, dist_context): with open(dist_attr_path, "wb") as f: pickle.dump(dist_attrs, f) + # save distributed params + dist_param_filename = filename + "_dist" + str(rank_id) + ".pdparams" + dist_param_path = os.path.join(dirname, dist_param_filename) + _save_state(dist_main_program, dist_param_path) + + # save distributed opt states + dist_opt_filename = filename + "_dist" + str(rank_id) + ".pdopt" + dist_opt_path = os.path.join(dirname, dist_opt_filename) + _save_state(dist_main_program, dist_opt_path, "opt") + # TODO:save cluster.json - def load(self, - path, - program, - dist_context, - strict=True, - load_optimizer=True): + def load(self, path, load_optimizer=True): # TODO: if `program` is None, load `path.pdmodel`. 
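+ # The nested helpers below gather the per-rank artifacts written by
+ # save(): files named '<filename>_dist<rank>.pdparams' (parameters),
+ # '.pdopt' (optimizer state), and '.pdattr' (distributed attributes),
+ # merging values found in several rank files into per-name lists.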
+ def _load_file(filename, dirname, suffix="pdparams"): + file_list = [] + for file in os.listdir(dirname): + if check_filename('{}(.*)_dist(.*).{}'.format(filename, suffix), + file): + file_list.append(os.path.join(dirname, file)) + file_list.sort() + return file_list + + def _load_state(filename, dirname, suffix="pdparams"): + file_list = _load_file(filename, dirname, suffix) + state_dict = {} + for file in file_list: + with open(file, 'rb') as f: + state_dict_info = pickle.load(f, encoding='latin1') + for name, value in state_dict_info.items(): + if name in state_dict: + state_dict[name].append(np.array(value)) + else: + state_dict[name] = [np.array(value)] + self._logger.info("Load param file: {}".format(file_list)) + return state_dict + filename = os.path.basename(path) if filename == "": raise ValueError( "path should be of 'dirname/filename' format, but received filename is empty string" ) dirname = os.path.dirname(path) - # load path.pdparam - param_file_list = [] - for param_file in os.listdir(dirname): - if check_filename('{}(.*)_dist(.*).pdparams'.format(filename), - param_file): - param_file_list.append(os.path.join(dirname, param_file)) - param_file_list.sort() - self._logger.info( - "Load distributed attribute file: {}".format(param_file_list)) - param_dict = {} - for param_file in param_file_list: - with open(param_file, 'rb') as f: - state_dict_info = pickle.load(f, encoding='latin1') - for name, value in state_dict_info.items(): - if name in param_dict: - param_dict[name].append(np.array(value)) - else: - param_dict[name] = [np.array(value)] + + # load path.pdparam and path.pdopt + param_state_dict = _load_state(filename, dirname) + opt_state_dict = _load_state(filename, dirname, + "pdopt") if load_optimizer else {} + state_dict = dict(param_state_dict, **opt_state_dict) # load path.pdattr - dist_attr_file_list = [] - for dist_attr_file in os.listdir(dirname): - if check_filename('{}(.*)_dist(.*).pdattr'.format(filename), - dist_attr_file): - dist_attr_file_list.append(os.path.join(dirname, - dist_attr_file)) - dist_attr_file_list.sort() + dist_attr_file_list = _load_file(filename, dirname, "pdattr") self._logger.info( "Load distributed attribute file: {}".format(dist_attr_file_list)) - pre_dist_attr = {} + dist_attr = {} for dist_attr_file in dist_attr_file_list: with open(dist_attr_file, 'rb') as f: - dist_attr = pickle.load(f, encoding='latin1') - for name, attr in dist_attr.items(): - if name not in pre_dist_attr: - pre_dist_attr[name] = attr - - # get current dist_attr - cur_dist_attr = get_dist_attr(program, dist_context) - - # param convert - converter = Converter(param_dict, pre_dist_attr, cur_dist_attr) - param_dict = converter.convert(strict=strict) - program.set_state_dict(param_dict) + dist_attr_info = pickle.load(f, encoding='latin1') + for name, attr in dist_attr_info.items(): + if name not in dist_attr: + dist_attr[name] = attr + + return state_dict, dist_attr def save_inference_model(self, path, feed_vars, fetch_vars, exe, **kwargs): diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 5389438d388a5..ee6bee45fd7fe 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -12,76 +12,169 @@ # See the License for the specific language governing permissions and # limitations under the License. 
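+# A reading note on the refactor below (drawn only from this diff): what
+# prepare() used to take as arguments now arrives through the constructor
+# (loss, optimizer, metrics, strategy), input specs are inferred from the
+# dataset inside fit()/evaluate(), and tuning options come from the Strategy
+# object rather than a user_tuning_config dict.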
+import os import time import copy import logging +import random +import numpy as np from collections import defaultdict import paddle import paddle.utils as utils from paddle import fluid, static -from paddle.io import Dataset from paddle.jit import to_static from paddle.metric import Metric from paddle.static import InputSpec from paddle.fluid import core -from paddle.fluid import program_guard +from paddle.fluid import Variable from paddle.fluid.layers.utils import flatten from paddle.fluid.executor import global_scope, _to_name_str -from paddle.fluid.backward import append_backward from paddle.fluid.framework import Operator, Parameter, _non_static_mode from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed import fleet -from paddle.distributed.passes import new_pass, PassContext +from .converter import Converter from .helper import ProgramHelper -from ..collective import _get_global_env from .cluster import Cluster, get_default_cluster from .planner_v2 import Planner from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator from .dist_saver import DistributedSaver from .dist_loader import NonIterableGeneratorLoader -from .utils import make_data_unshard, set_grad_var_shape from .utils import print_program_with_dist_attr, to_list -from .process_group import new_process_group, get_all_process_groups, get_world_process_group +from .utils import get_logger, get_dist_attr +from .process_group import new_process_group, get_all_process_groups from .dist_context import DistributedContext, get_default_distributed_context +from .strategy import Strategy +from .interface import _get_fetches class Engine: + """ + An Engine object provides the full power of auto parallel to users. + With its help, users can easily obtain the abilities of distributed + training and inference. It also supports dynamic graph and static graph + at the same time. + + Args: + model (paddle.nn.Layer, optional): The model is an instance of + paddle.nn.Layer. + loss (Loss|Callable|None, optional): The loss can be a `paddle.nn.Layer` + instance or any callable function taking the predicted values and + ground-truth values as input. It can be None when there is no loss. + Default: None. + optimizer (Optimizer|None, optional): The optimizer needs to be set in training + and should be None in eval and predict mode. Default: None. + metrics (Metric|list[Metric]|None, optional): If metrics is set, all + metrics will be calculated and output in train/eval mode. Default: None. + cluster (Cluster|None, optional): The cluster represents the topology information + about the physical devices in use. Default: None. (Unused for now) + strategy (Strategy|None, optional): The strategy is used to configure the + parallelization and optimization behaviors. Default: None. + + Examples: + + ..
code-block:: python + + import paddle + import paddle.vision.transforms as T + import paddle.distributed.auto_parallel as auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + valid_dataset = MNIST(mode='test', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, optimizer, metrics) + # fit + engine.fit(train_dataset, + epochs=2, + batch_size=64) + # evaluate + engine.evaluate(valid_dataset, + batch_size=64) + # predict + engine.predict(valid_dataset, + batch_size=64) + # save + engine.save("./my_model") + # load + engine.load("./my_model") + + """ def __init__(self, model=None, - inputs_spec=None, - labels_spec=None, + loss=None, + optimizer=None, + metrics=None, cluster=None, - strategy=None, - user_tuning_config=None): - self.model = model - self.strategy = strategy or fleet.DistributedStrategy() - self.inputs_spec = self._validate_spec(inputs_spec) - self.labels_spec = self._validate_spec(labels_spec) - self.cluster = cluster or get_default_cluster() - self._user_tuning_config = user_tuning_config + strategy=None): + + if model and not isinstance(model, + paddle.nn.Layer) and not callable(model): + raise TypeError( + "'model' must be a subclass of `paddle.nn.Layer` or a callable function." + ) + self._model = model + + if loss and not isinstance(loss, + paddle.nn.Layer) and not callable(loss): + raise TypeError( + "'loss' must be a subclass of `paddle.nn.Layer` or a callable function." + ) + self._loss = loss + + if optimizer and not isinstance( + optimizer, + (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer)): + raise TypeError( + "'optimizer' must be an object of class `paddle.optimizer.Optimizer`" + " or `paddle.fluid.optimizer.Optimizer`.") + self._optimizer = self._validate_opt(optimizer) + + metrics = metrics or [] + for metric in to_list(metrics): + assert isinstance(metric, Metric), \ + "{} is not a subclass of Metric".format( + metric.__class__.__name__) + self._metrics = to_list(metrics) + + if cluster and not isinstance(cluster, Cluster): + raise TypeError( + "'cluster' must be an object of class `paddle.distributed.auto_parallel.Cluster`." + ) + self._cluster = cluster or get_default_cluster() + + if strategy and not isinstance(strategy, Strategy): + raise TypeError( + "'strategy' must be an object of class `paddle.distributed.auto_parallel.Strategy`." + ) + self._strategy = strategy or Strategy() + + if os.getenv("POD_NAME"): + print("Distribute training by paddle.distributed.launch", + flush=True) + fleet.init(is_collective=True) self._executor = None self._cur_rank = paddle.distributed.get_rank() self._nranks = paddle.distributed.get_world_size() self._saver = DistributedSaver() - # TODO: add logger module - self._logger = logging.getLogger() - self._logger.propagate = False - if not self._logger.handlers: - self._logger.setLevel(logging.INFO) - log_handler = logging.StreamHandler() - log_format = logging.Formatter( - '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s' - ) - log_handler.setFormatter(log_format) - self._logger.addHandler(log_handler) + self._logger = get_logger(logging.INFO) self._orig_main_prog = static.default_main_program() self._orig_startup_prog =
static.default_startup_program() @@ -99,54 +192,18 @@ def __init__(self, "eval": False, "predict": False } - self._dygraph_mode = False - - def prepare(self, - optimizer=None, - loss=None, - gradient_scale=True, - metrics=None, - all_ranks=False): - if optimizer and not isinstance( - optimizer, - (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer)): - raise TypeError( - "'optimizer' must be object of class `paddle.optimizer.Optimizer`" \ - " or `paddle.fluid.optimizer.Optimizer`." - ) - self._optimizer = self._validate_opt(optimizer) - - if loss and not isinstance(loss, - paddle.nn.Layer) and not callable(loss): - raise TypeError( - "'loss' must be sub classes of `paddle.nn.Layer` or any callable function." - ) - self._loss = loss - metrics = metrics or [] - for metric in to_list(metrics): - assert isinstance(metric, Metric), \ - "{} is not sub class of Metric".format( - metric.__class__.__name__) - self._metrics = to_list(metrics) - self._gradient_scale = gradient_scale self._planned_mode = None - self._all_ranks = all_ranks - self._prepare_single_mode("train") + self._dygraph_mode = False + self._tuning = self._strategy.tuning def _prepare_single_mode(self, mode): - + # Do the build process self._build(mode) # Do the planning process self._plan(mode) - - # Do the Optimization tuning - if self._user_tuning_config and mode == "train": - self._optimization_tuning(mode) - # Do the parallel process - self._parallel(mode, self._all_ranks) - + self._parallel(mode) # Init comm and startup program self._initialize(mode) self._mode_init_states[mode] = True @@ -159,7 +216,7 @@ def _build(self, mode): inputs_spec = self.inputs_spec labels_spec = self.labels_spec if self.labels_spec else [] - self.program_helper = ProgramHelper(self.model, self._loss, + self.program_helper = ProgramHelper(self._model, self._loss, self._metrics, inputs_spec, labels_spec) # build forward main program @@ -186,14 +243,13 @@ def _build(self, mode): metrics = [] serial_main_prog = self._orig_main_prog.clone() serial_startup_prog = self._orig_startup_prog.clone() - # FIXME to support grad clip with static.program_guard(serial_main_prog, serial_startup_prog), \ utils.unique_name.guard(): inputs_spec = self.inputs_spec labels_spec = self.labels_spec if self.labels_spec else [] inputs = [s._create_feed_layer() for s in inputs_spec] labels = [s._create_feed_layer() for s in labels_spec] - outputs = to_list(self.model(*inputs)) + outputs = to_list(self._model(*inputs)) if mode != "predict" and self._loss: losses = to_list(self._loss(*(outputs + labels))) @@ -217,25 +273,30 @@ def _build(self, mode): "metrics": metrics } + if mode != "train": + serial_main_prog = serial_main_prog.clone(for_test=True) + self._set_recompute_ckpts() self._dist_contexts[mode] = DistributedContext( serial_main_prog, serial_startup_prog, self._optimizer, losses, - feed_vars, fetch_vars, self.cluster, self.strategy) - self._dist_contexts[mode].gradient_scale = self._gradient_scale - self._dist_contexts[mode]._dygraph_mode = self._dygraph_mode + feed_vars, fetch_vars, self._cluster, self._strategy) + self._dist_contexts[mode].gradient_scale = self._strategy.gradient_scale - def _optimization_tuning(self, mode): + def _optimization_tuning(self, mode, dataset, batch_size): + if not self._tuning.enable: + raise ValueError("Please set `tuning.enable=True`.") - self.mode = mode - assert "batch_size" in self._user_tuning_config, "Optimization Tuning should provide with batch size." 
- assert "dataset" in self._user_tuning_config, "Optimization Tuning should provide with dataset." - batch_size = self._user_tuning_config["batch_size"] - dataset = self._user_tuning_config["dataset"] - dataset.dp_world_size = self.dp_world_sizes - dataset.dp_rank = self.dp_ranks + assert mode == "train" + # Do the build process + self._build(mode) + # Do the planning process + self._plan(mode) + + dataset.dp_world_size = self._dp_world_sizes + dataset.dp_rank = self._dp_ranks from .tuner.optimization_tuner import OptimizationTuner - self._optimization_tuner = OptimizationTuner(self._user_tuning_config, + self._optimization_tuner = OptimizationTuner(self._tuning.to_dict(), self._dist_contexts[mode], dataset, self.inputs_spec, @@ -245,12 +306,10 @@ def _optimization_tuning(self, mode): self._optimization_tuner.tune() - if self._user_tuning_config["run_after_tuning"]: + if self._tuning.run_after_tuning: # update the strategy self._dist_contexts[ mode]._strategy = self._optimization_tuner.get_best_config() - else: - return def _plan(self, mode): if self._planned_mode is None: @@ -270,15 +329,15 @@ def _plan(self, mode): if var.name in block.vars: feed_list.append(block.vars[var.name]) - self.dp_world_sizes = [] - self.dp_ranks = [] + self._dp_world_sizes = [] + self._dp_ranks = [] for feed_var in feed_list: dp_world_size, dp_rank = self._get_input_split_info( feed_var, self._dist_contexts[mode]) - self.dp_world_sizes.append(dp_world_size) - self.dp_ranks.append(dp_rank) + self._dp_world_sizes.append(dp_world_size) + self._dp_ranks.append(dp_rank) - def _parallel(self, mode, all_ranks): + def _parallel(self, mode, all_ranks=False): # Parallelize program based on the planner's results # For now, the completer has to be passed to the planner, # because we may use it to complete the annotation of the backwarkward and update. @@ -336,6 +395,11 @@ def _initialize(self, mode): if isinstance(place, fluid.CUDAPlace): place = fluid.CUDAPlace(ParallelEnv().dev_id) + if self._strategy.seed: + paddle.seed(self._strategy.seed + self._dp_ranks[0]) + np.random.seed(self._strategy.seed + self._dp_ranks[0]) + random.seed(self._strategy.seed + self._dp_ranks[0]) + if self._dygraph_mode: dist_context = self._dist_contexts[mode] dist_main_program = self._dist_main_progs[mode][self._cur_rank] @@ -354,102 +418,299 @@ def _initialize(self, mode): prune_startup_prog = dist_startup_prog._prune(uninitialized) self._executor.run(prune_startup_prog) + if hasattr(self, "_state_dict") and hasattr(self, "_dist_attr"): + self._set_state_dict(mode, self._strict, self._state_dict, + self._dist_attr) + + if self._strategy.reinit: + self._logger.info("NOTE: parameters wiil be re-initialized.") + dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + self._executor.run(dist_startup_prog) + + def _infer_sample_spec(self, data, batch_size, split): + if isinstance(data, paddle.io.IterableDataset): + if split is None: + input, label = next(iter(data)) + else: + sample = next(iter(data)) + input = sample[:split] + label = sample[split:] + elif isinstance(data, paddle.io.Dataset): + if split is None: + input, label = data[0] + else: + sample = data[0] + input = sample[:split] + label = sample[split:] + else: + raise ValueError( + "Data should be a Dataset or IterableDatset, but received {}.". 
+ format(type(data).__name__)) + + self.inputs_spec = [] + self.labels_spec = [] + input_list = to_list(input) + label_list = to_list(label) + + def _infer_item_spec(item, name, batch_size, specs): + if isinstance(item, np.ndarray): + spec = InputSpec.from_numpy(item, name) + if batch_size is None: + specs.append(spec) + else: + specs.append(spec.batch(batch_size)) + elif isinstance(item, (Variable, core.VarBase, core.eager.Tensor)): + spec = InputSpec.from_tensor(item, name) + if batch_size is None: + specs.append(spec) + else: + specs.append(spec.batch(batch_size)) + else: + specs.append(InputSpec([batch_size], type(item), name)) + + if input_list is not None: + for i, item in enumerate(input_list): + assert item is not None, "Received None input." + name = "input" + str(i) + _infer_item_spec(item, name, batch_size, self.inputs_spec) + if label_list is not None: + for i, item in enumerate(label_list): + assert item is not None, "Received None input." + name = "label" + str(i) + _infer_item_spec(item, name, batch_size, self.labels_spec) + + self.inputs_spec = self._validate_spec(self.inputs_spec) + self.labels_spec = self._validate_spec(self.labels_spec) + def fit(self, train_data, + train_sample_split=None, batch_size=1, epochs=1, - fetches=None, steps_per_epoch=None, + valid_data=None, + valid_sample_split=None, + valid_freq=1, + valid_steps=None, collate_fn=None, - use_cache=False, - return_numpy=True): - # TODO: callbacks - # TODO: evaluate after training - - if not self._mode_init_states['train']: - raise Exception( - "train program is not initialized yet, please call engine.prepare() before calling fit() funtion." - ) - + callbacks=None): + """ + Trains the model for a fixed number of epochs. If `valid_data` is set, + evaluation will be done at the end of each epoch. + + Args: + train_data (Dataset): An instance of paddle.io.Dataset. Default: None. + train_sample_split (int, optional): Each sample of the train dataset is assumed + to be an (input, label) pair by default and has two items. If each sample has + more than two items, train_sample_split specifies how to split these items into + input and label. The items before it are input and the rest are label. Default: None. + batch_size (int, optional): The batch size of train_data and valid_data if provided. + The user's data will be used directly without batching if set to None. Default: 1. + epochs (int, optional): The number of epochs to train the model. Default: 1. + steps_per_epoch (int, optional): The total number of steps (batches of samples) + executed in one epoch before starting the next one. If None, it is equal to + the number of samples in your dataset divided by the batch size. Default: None. + valid_data (Dataset, optional): An instance of paddle.io.Dataset used for + evaluation at the end of each epoch. No evaluation will be done if set to None. + Default: None. (Unsupported for now) + valid_freq (int, optional): Only relevant if valid_data is provided. This specifies + how many training epochs pass before a new evaluation is performed. Default: 1. + valid_sample_split (int, optional): Only relevant if valid_data is provided. + Each sample of the valid dataset is assumed to be an (input, label) pair + by default and has two items. If each sample has more than two items, + valid_sample_split specifies how to split these items into input and label. + The items before it are input and the rest are label. Default: None. + valid_steps (int, optional): Only relevant if valid_data is provided.
+ It is the total number of steps (batches of samples) to draw before + stopping validation at the end of every epoch. If None, validation will run until the + `valid_data` dataset is exhausted. The validation will start from the + beginning of the dataset at each epoch. Default: None. + collate_fn(callable, optional): function to generate mini-batch data by merging + the sample list, None for only stack each fields of sample in axis + 0. Default None. + callbacks (Callback|None, optional): A list of `Callback` instances to apply + during training. Default: None. (Unused for now) + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle + import paddle.vision.transforms as T + import paddle.distributed.auto_parallel as auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, optimizer, metrics) + engine.fit(train_dataset, + epochs=2, + batch_size=64) + """ self.mode = 'train' + self._infer_sample_spec(train_data, batch_size, train_sample_split) + if not self._mode_init_states[self.mode]: + self._prepare_single_mode(self.mode) + assert self.mode in self._dist_main_progs, \ "train model is not ready, please call `engine.prepare()` first." train_dataloader = self._create_dataloader(train_data, batch_size, epochs, steps_per_epoch, collate_fn) - usr_fetch = self._validate_fetches(fetches) fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) - fetch_list, fetch_map = self._fetch_map(fetch_loss, usr_fetch) - lr_scheduler = self.get_lr_scheduler(self.main_program) + fetch_metrics = self._validate_fetches(self.fetch_vars["metrics"]) + inner_fetch = dict(fetch_loss, **fetch_metrics) + usr_fetch = self._validate_fetches(_get_fetches()) + fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) + lr_scheduler = self._get_lr_scheduler(self.main_program) + outputs = defaultdict(list) for epoch in range(epochs): train_logs = {"epoch: {:d} ": epoch} for step, _ in enumerate(train_dataloader): try: - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_cache, - return_numpy=return_numpy) - except fluid.core.EOFException: + outs = self._executor.run( + self.main_program, + fetch_list=fetch_list, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy) + except core.EOFException: break - train_logs["step: {:d} "] = step - if lr_scheduler is not None and step % self.k_steps == 0: + # update lr + if lr_scheduler and step % self._k_steps == 0: lr_scheduler.step() - try: - train_logs["lr: {:5e} "] = self._lr_optimizer.get_lr() - except: - train_logs[ - "lr: {:5e} "] = self._lr_optimizer._learning_rate.get_lr( - ) + train_logs["lr: {:5e} "] = self._get_lr(self._lr_optimizer) # inner fetches if fetch_loss: - train_logs["loss: {:9f} "] = outs[0][0] + train_logs["loss: {:8f} "] = outs[0][0] + outputs["loss"].append(outs[0][0]) + # Metric + if fetch_metrics: + metric_out = outs[len(fetch_loss):len(inner_fetch)] + for metric in self._metrics: + metric.update(*metric_out) + results = metric.accumulate() + for i, res in enumerate(to_list(results)): + train_logs[metric.name()[i] + ": {:8f} "] = res + 
outputs[metric.name()[i]].append(res) # user fetches - user_outs = outs[len(fetch_loss):] - user_fetch_list = fetch_list[len(fetch_loss):] + user_outs = outs[len(inner_fetch):] + user_fetch_list = fetch_list[len(inner_fetch):] for i, out in enumerate(user_outs): train_logs[fetch_map[user_fetch_list[i]] + ": {}"] = out # logger string = '[train] ' + ''.join(list(train_logs.keys())) self._logger.info(string.format(*list(train_logs.values()))) + if valid_data and epoch % valid_freq == 0: + self.evaluate(valid_data, valid_sample_split, batch_size, + valid_steps, collate_fn, callbacks) + self._switch_mode("train") + + self._reset_metrics() + return outputs + def evaluate(self, - eval_data, + valid_data, + valid_sample_split=None, batch_size=1, - fetches=None, + steps=None, collate_fn=None, - use_cache=False, - return_numpy=True): + callbacks=None): + """ + Evaluate the loss and metrics of the model on evaluation data. + + Args: + valid_data (Dataset): An instance of paddle.io.Dataset. Default: None. + valid_sample_split (int, optional): Each sample of the valid dataset is assumed + to be an (input, label) pair by default and has two items. If each sample has + more than two items, valid_sample_split specifies how to split these items into + input and label. The items before it are the input and the rest are the label. Default: None. + batch_size (int, optional): The batch size of valid_data. The user's data will + be used directly without batching if set to None. Default: 1. + steps (int, optional): It is the total number of steps (batches of samples) to draw before + stopping evaluation. If None, evaluation will run until the `valid_data` dataset is exhausted. + The evaluation will start from the beginning of the dataset in each run. Default: None. + collate_fn(callable, optional): function to generate mini-batch data by merging + the sample list, None for only stack each fields of sample in axis + 0. Default None. + callbacks (Callback|None, optional): A list of `Callback` instances to apply + during evaluation. Default: None. (Unused for now) + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle + import paddle.vision.transforms as T + import paddle.distributed.auto_parallel as auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + valid_dataset = MNIST(mode='test', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, metrics=metrics) + engine.evaluate(valid_dataset, batch_size=64) + + """ self.mode = 'eval' + self._infer_sample_spec(valid_data, batch_size, valid_sample_split) if not self._mode_init_states[self.mode]: self._prepare_single_mode(self.mode) assert self.mode in self._dist_main_progs, \ "eval model is not ready, please call `engine.prepare()` first."
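The new `*_sample_split` arguments of `fit`, `evaluate` and `predict` all funnel into `_infer_sample_spec` above; a minimal sketch of the splitting rule it applies, using a hypothetical three-field sample:

.. code-block:: python

    # Items before `split` are treated as the input, the rest as the label
    # (rule taken from _infer_sample_spec above; field names are made up).
    sample = ("tokens", "mask", "label")
    split = 2
    inputs, labels = sample[:split], sample[split:]
    assert inputs == ("tokens", "mask")
    assert labels == ("label",)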
- eval_dataloader = self._create_dataloader(eval_data, - batch_size, - collate_fn=collate_fn) + valid_dataloader = self._create_dataloader(valid_data, + batch_size, + steps_per_epoch=steps, + collate_fn=collate_fn) - usr_fetch = self._validate_fetches(fetches) fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) fetch_metrics = self._validate_fetches(self.fetch_vars["metrics"]) inner_fetch = dict(fetch_loss, **fetch_metrics) + usr_fetch = self._validate_fetches(_get_fetches()) fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) - for step, _ in enumerate(eval_dataloader): - eval_logs = {"step: {:d} ": step} + outputs = defaultdict(list) + for step, _ in enumerate(valid_dataloader): try: - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_cache, - return_numpy=return_numpy) - except fluid.core.EOFException: + outs = self._executor.run( + self.main_program, + fetch_list=fetch_list, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy) + except core.EOFException: break + eval_logs = {"step: {:d} ": step} # inner fetches if fetch_loss: - eval_logs["loss: {:9f} "] = outs[0][0] + eval_logs["loss: {:8f} "] = outs[0][0] + outputs["eval_loss"].append(outs[0][0]) # Metric if fetch_metrics: metric_out = outs[len(fetch_loss):len(inner_fetch)] @@ -457,8 +718,9 @@ def evaluate(self, metric.update(*metric_out) results = metric.accumulate() for i, res in enumerate(to_list(results)): - eval_logs[metric.name()[i] + ": {:9f} "] = res - # usr fetches + eval_logs[metric.name()[i] + ": {:8f} "] = res + outputs["eval_" + metric.name()[i]].append(res) + # user fetches usr_outs = outs[len(inner_fetch):] usr_fetch_list = fetch_list[len(inner_fetch):] for i, out in enumerate(usr_outs): @@ -466,15 +728,61 @@ def evaluate(self, # logger string = '[eval] ' + ''.join(list(eval_logs.keys())) self._logger.info(string.format(*list(eval_logs.values()))) + self._reset_metrics() + return outputs def predict(self, test_data, + test_sample_split=None, batch_size=1, - fetches=None, + steps=None, collate_fn=None, - use_cache=False, - return_numpy=True): + callbacks=None): + """ + Compute the output predictions on testing data. + + Args: + test_data (Dataset): An instance of paddle.io.Dataset. Default: None. + test_sample_split (int, optional): Each sample of the test dataset is assumed + to be an (input, label) pair by default and has two items. If each sample has + more than two items, test_sample_split specifies how to split these items into + input and label. The items before it are the input and the rest are the label. Default: None. + batch_size (int, optional): The batch size of test_data. The user's data will + be used directly without batching if set to None. Default: 1. + steps (int, optional): It is the total number of steps (batches of samples) to draw before + stopping prediction. If None, prediction will run until the `test_data` dataset is exhausted. + Prediction will start from the beginning of the dataset in each run. Default: None. + collate_fn(callable, optional): function to generate mini-batch data by merging + the sample list, None for only stack each fields of sample in axis + 0. Default None. + callbacks (Callback|None, optional): A list of `Callback` instances to apply + during testing. Default: None. (Unused for now) + + Returns: + None + + Examples: + + ..
code-block:: python + + import paddle + import paddle.vision.transforms as T + import paddle.distributed.auto_parallel as auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + valid_dataset = MNIST(mode='test', transform=transform) + + model = paddle.vision.models.LeNet() + + engine = auto.Engine(model) + engine.predict(valid_dataset, batch_size=64) + """ self.mode = 'predict' + self._infer_sample_spec(test_data, batch_size, test_sample_split) if not self._mode_init_states[self.mode]: self._prepare_single_mode(self.mode) @@ -482,22 +790,24 @@ def predict(self, "predict model is not ready, please call `engine.prepare()` first." test_dataloader = self._create_dataloader(test_data, batch_size, + steps_per_epoch=steps, collate_fn=collate_fn) - usr_fetch = self._validate_fetches(fetches) fetch_outputs = self._validate_fetches(self.fetch_vars["outputs"]) + usr_fetch = self._validate_fetches(_get_fetches()) fetch_list, fetch_map = self._fetch_map(fetch_outputs, usr_fetch) outputs = [] for step, _ in enumerate(test_dataloader): - predict_logs = {"step: {:d} ": step} try: - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_cache, - return_numpy=return_numpy) - except fluid.core.EOFException: + outs = self._executor.run( + self.main_program, + fetch_list=fetch_list, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy) + except core.EOFException: break + predict_logs = {"step: {:d} ": step} outputs.append(outs[:len(fetch_outputs)]) for i, out in enumerate(outs): predict_logs[fetch_map[fetch_list[i]] + ": {}"] = out @@ -507,6 +817,11 @@ def predict(self, return outputs + def _tune(self, tune_data, tune_sample_split=None, batch_size=1): + self.mode = 'train' + self._infer_sample_spec(tune_data, batch_size, tune_sample_split) + self._optimization_tuning(self.mode, tune_data, batch_size) + def _create_dataloader(self, dataset, batch_size, @@ -514,10 +829,10 @@ def _create_dataloader(self, steps_per_epoch=None, collate_fn=None): - if self.strategy.gradient_merge and batch_size is not None: - assert batch_size % self.k_steps == 0, \ - "Requires batch_size:[{}] to be divisible by k_steps:[{}].".format(batch_size, self.k_steps) - batch_size //= self.k_steps + if self._strategy.gradient_merge and batch_size is not None: + assert batch_size % self._k_steps == 0, \ + "Requires batch_size:[{}] to be divisible by k_steps:[{}].".format(batch_size, self._k_steps) + batch_size //= self._k_steps dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank] @@ -557,9 +872,9 @@ def _create_dataloader(self, epochs, steps_per_epoch, collate_fn, - data_parallel_world_size=self.dp_world_sizes, - data_parallel_rank=self.dp_ranks, - split_data=self.strategy.split_data) + data_parallel_world_size=self._dp_world_sizes, + data_parallel_rank=self._dp_ranks, + split_data=self._strategy.split_data) # move read op from the end of program to the start of program new_op_size = len(dist_main_block.ops) @@ -580,9 +895,7 @@ def _create_dataloader(self, def _validate_spec(self, specs): specs = to_list(specs) - self.k_steps = 1 - if self.strategy.gradient_merge: - self.k_steps = self.strategy.gradient_merge_configs['k_steps'] + self._k_steps = self._strategy.gradient_merge.k_steps if specs is not None: for i, spec in enumerate(specs): assert isinstance(spec, InputSpec) @@ -590,11 +903,11 @@ def 
_validate_spec(self, specs): raise ValueError( "Requires Input[{}].name != None, but receive `None` with {}." .format(i, spec)) - if self.k_steps > 1: + if self._k_steps > 1: shape = list(spec.shape) - assert shape[0] % self.k_steps == 0, \ - "Requires batch_size:[{}] to be divisible by k_steps:[{}].".format(spec.shape[0], self.k_steps) - shape[0] //= self.k_steps + assert shape[0] % self._k_steps == 0, \ + "Requires batch_size[{}] to be divisible by k_steps[{}].".format(spec.shape[0], self._k_steps) + shape[0] //= self._k_steps spec.shape = shape return specs @@ -655,38 +968,95 @@ def _set_recompute_ckpts(self): # NOTE hack to enable recompute in engine api for GPT-3 # TODO support more PaddleNLP/CV models here - config = self.strategy.recompute_configs + recompute = self._strategy.recompute # extract ckpts by specific model - if isinstance(self.model, paddle.nn.Layer): + if isinstance(self._model, paddle.nn.Layer): if hasattr( - self.model, "gpt" - ) and self.model.__class__.__name__ == 'GPTForPretraining': - exact_ckpts = self.model.gpt.checkpoints + self._model, "gpt" + ) and self._model.__class__.__name__ == 'GPTForPretraining': + exact_ckpts = self._model.gpt.checkpoints else: - exact_ckpts = config["checkpoints"] + exact_ckpts = recompute.checkpoints else: - exact_ckpts = config["checkpoints"] + exact_ckpts = recompute.checkpoints # modify strategy - if self.strategy.recompute: - config["checkpoints"] = exact_ckpts[:] - self.strategy.recompute_configs = config + if recompute.enable: + recompute.checkpoints = exact_ckpts[:] logs = { - 'Model Class': self.model.__class__.__name__, + 'Model Class': self._model.__class__.__name__, 'Applied Recompute ckpts': exact_ckpts } self._logger.info(logs) def _validate_opt(self, optimizer): - optimizer._parameter_list = None - optimizer._param_groups = None + if optimizer is not None: + optimizer._parameter_list = None + optimizer._param_groups = None return optimizer - def save(self, path, training=True, mode=None): - if not mode: - mode = self.mode + def _reset_metrics(self): + for metric in self._metrics: + metric.reset() + + def _switch_mode(self, mode): + self.mode = mode + self._initialize(mode) + + def _set_state_dict(self, mode, strict, state_dict, dist_attr): + program = self._dist_main_progs[mode][self._cur_rank] + dist_context = self._dist_contexts[mode] + cur_dist_attr = get_dist_attr(program, dist_context) + converter = Converter(state_dict, dist_attr, cur_dist_attr) + state_dict = converter.convert(strict=strict) + program.set_state_dict(state_dict) + + def save(self, path, training=True): + """ + Saves the model, parameters, and optimizer state to the path. + If `training` is set to False, only the inference model will be saved. + + Args: + path (str): The file prefix to save the model. The format + is 'dirname/file_prefix' or 'file_prefix'. An exception + will be raised if the path is an empty string. + training (bool, optional): Whether to save for training. If not, save + for inference only. If `training` is set to True, the optimizer state + will be saved. Otherwise, only the model and parameters are saved. + This function will silently overwrite an existing file at the target + location. Default: True. + + Returns: + None + + Examples: + + ..
code-block:: python + import paddle + import paddle.vision.transforms as T + import paddle.distributed.auto_parallel as auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, optimizer, metrics) + engine.fit(train_dataset, + epochs=1, + batch_size=64) + engine.save("./my_model") + """ if training: assert 'train' in self._serial_main_progs, \ "training model is not ready, please call `engine.prepare()` first." @@ -698,7 +1068,7 @@ def save(self, path, training=True, mode=None): dist_main_program=dist_main_prog, dist_context=dist_context) else: - assert mode, "Please set the 'mode' you want to save." + mode = "predict" feed_vars = self._feed_vars[mode]['inputs'] fetch_vars = self._fetch_vars[mode]['outputs'] dist_main_prog = self._dist_main_progs[mode][self._cur_rank] @@ -708,18 +1078,59 @@ def save(self, path, training=True, mode=None): self._executor, program=dist_main_prog) - def load(self, path, strict=True, load_optimizer=True, mode=None): - if not mode: - mode = self.mode - assert mode, "Please set the 'mode' you want to load." + def load(self, path, strict=True, load_optimizer=True): + """ + Load the stored model, parameters and optimizer states. + + Args: + path (str): The prefix of files storing the model states and + optimizer states. + strict (bool, optional): Whether to skip the loading of mismatched + parameters or raise an error when a mismatch happens (the parameter + is not found in the file storing the model states, or its shape + does not match). Default: True. + load_optimizer (bool, optional): If True, the stored optimizer + states are restored. Otherwise, the optimizer states are initialized + from scratch. Default: True. + + Returns: + None + + Examples: + + ..
code-block:: python + import paddle + import paddle.vision.transforms as T + import paddle.distributed.auto_parallel as auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, optimizer, metrics) + engine.fit(train_dataset, + epochs=1, + batch_size=64) + engine.save("./my_model") + engine.load("./my_model") - dist_main_prog = self._dist_main_progs[mode][self._cur_rank] - dist_context = self._dist_contexts[mode] - self._saver.load(path, dist_main_prog, dist_context, strict, - load_optimizer) + """ + self._strict = strict + self._state_dict, self._dist_attr = self._saver.load( + path, load_optimizer) + return self._state_dict, self._dist_attr @staticmethod - def get_lr_scheduler(program): + def _get_lr_scheduler(program): lr_sheduler = None if hasattr(program, 'lr_sheduler'): from paddle.optimizer.lr import LRScheduler @@ -727,6 +1138,20 @@ def get_lr_scheduler(program): assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" return lr_sheduler + def _get_lr(self, optimizer): + if isinstance(optimizer, paddle.optimizer.Optimizer): + return optimizer.get_lr() + elif isinstance(optimizer, paddle.fluid.optimizer.Optimizer): + if isinstance(optimizer._learning_rate, float): + return optimizer._learning_rate + else: + return optimizer._learning_rate() + else: + raise TypeError( + "'optimizer' must be object of class `paddle.optimizer.Optimizer`" \ + " or `paddle.fluid.optimizer.Optimizer`, but got {}.".format(type(optimizer)) + ) + @property def mode(self): return self._mode @@ -758,3 +1183,11 @@ def serial_startup_program(self): @property def fetch_vars(self): return self._fetch_vars[self.mode] + + @property + def inputs(self): + return self.inputs_spec + + @property + def labels(self): + return self.labels_spec diff --git a/python/paddle/distributed/auto_parallel/helper.py b/python/paddle/distributed/auto_parallel/helper.py index 72a8b7df7c8e8..6bc177efc9de9 100644 --- a/python/paddle/distributed/auto_parallel/helper.py +++ b/python/paddle/distributed/auto_parallel/helper.py @@ -19,13 +19,13 @@ from paddle.nn import Layer from paddle.jit import to_static, not_to_static -from paddle.distributed.utils.log_utils import get_logger from paddle.fluid.framework import Operator, Parameter, _non_static_mode from paddle.fluid.framework import program_guard from paddle.fluid.executor import global_scope from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction from .utils import to_list +from .utils import get_logger from .converter import Converter diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py index e06120a7e19d0..ad3078c449048 100644 --- a/python/paddle/distributed/auto_parallel/interface.py +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -12,101 +12,198 @@ # See the License for the specific language governing permissions and # limitations under the License. 
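The reworked `shard_tensor` below derives each shard's shape by dividing every tensor dimension by the size of the mesh axis named in its `shard_spec` entry; a small sketch of that rule (pure Python, helper name hypothetical):

.. code-block:: python

    def shard_shape(tensor_shape, mesh_shape, dim_names, shard_spec):
        # A dimension mapped to a mesh axis is divided by that axis' size;
        # a None entry leaves the dimension unsharded.
        shape = list(tensor_shape)
        for i, name in enumerate(shard_spec):
            if name is not None:
                shape[i] //= mesh_shape[dim_names.index(name)]
        return shape

    assert shard_shape([6, 12], [2, 3], ["x", "y"], ["x", "y"]) == [3, 4]
    assert shard_shape([6, 12], [2, 3], ["x", "y"], [None, "x"]) == [6, 6]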
-import numpy -import copy import paddle -import paddle.fluid.core as core -from paddle.fluid.framework import Variable -from paddle.fluid.framework import _non_static_mode +from paddle.fluid import core +from .process_mesh import ProcessMesh +from .process_mesh import get_current_process_mesh +from .process_mesh import set_current_process_mesh +from .process_mesh import reset_current_process_mesh from .dist_context import get_default_distributed_context from .dist_tensor import DistributedTensor -from .dist_op import DistributedModule -from .dist_attribute import TensorDistributedAttribute -from .dist_attribute import OperatorDistributedAttribute +from .dist_op import DistributedOperatorHelper +from .utils import verify_shard_spec, convert_to_dims_mapping -def _static_mode_check(): - if _non_static_mode(): - raise RuntimeError("Auto-parallel only supports static mode for now, " - "please use paddle.enable_static() first.") - - -def shard_tensor(x, dist_attr=None): +def shard_tensor(x, process_mesh=None, shard_spec=None): """ - Add distributed attributes for a tensors. + Shard a tensor on a process mesh according to the shard specification. Args: x (Tensor): the tensor to be sharded. - dist_attr (dict): the tensor distributed attributes. The accepted attributes are as follow: - "process_mesh": a nested list an to describe the mesh topology of logical processes. - "dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension - `i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`, - where -1 means that tensor dimension is not split. - Both process_mesh and dims_mapping are optional and users can specify as need. + process_mesh (ProcessMesh, optional): An instance of ProcessMesh that describes the mesh + topology of the used logical processes where the tensor is sharded. If it is None, + the current process mesh will be used, and an error will be raised if no + current process mesh can be found. Default: None. + shard_spec (list, optional): a list to describe the sharding mapping between `x` and `process_mesh`, + which means the dimension `i` of `x` is split across the dimension `shard_spec[i]` of `process_mesh`, + where `None` means that tensor dimension is not split. For example, given a tensor with + the shape [6, 12] and a process mesh with the shape [2, 3] and the dimension names ["x", "y"]: + If `shard_spec=["x", "y"]`, each shard of the tensor will have a shape [3, 4]; + If `shard_spec=["y", "x"]`, each shard of the tensor will have a shape [2, 6]; + If `shard_spec=["x", None]`, each shard of the tensor will have a shape [3, 12]; + If `shard_spec=[None, "x"]`, each shard of the tensor will have a shape [6, 6]; + If `shard_spec=["y", None]`, each shard of the tensor will have a shape [2, 12]; + If `shard_spec=[None, "y"]`, each shard of the tensor will have a shape [6, 4]; + If `shard_spec=[None, None]`, each shard of the tensor will have a shape [6, 12]; + If the `shard_spec` is None, the tensor will be replicated across all the processes of `process_mesh`. + In the above example, `shard_spec=None` is the same as `shard_spec=[None, None]`. Default: None. Returns: - Tensor: the tensor `x` annotated with distributed attributes. + Tensor: the tensor `x` annotated with sharding information. Examples: ..
code-block:: python import paddle - import paddle.distributed as dist - - paddle.enable_static() + import paddle.distributed.auto_parallel as auto + mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) x = paddle.ones([4, 6]) - dist.shard_tensor(x, dist_attr={"process_mesh": [[0, 1], [2, 3]], - "dims_mapping": [0, -1]}) + shard_spec = ["x", "y"] + auto.shard_tensor(x, mesh, shard_spec) """ - _static_mode_check() - assert dist_attr is None or isinstance(dist_attr, (dict, TensorDistributedAttribute)), \ - "The type of dist_attr must be None, dict or TensorDistributedAttribute." - dist_tensor = DistributedTensor(x, dist_attr) - dist_tensor.dist_attr.mark_annotated_as(dist_attr) + + if process_mesh is not None: + assert isinstance(process_mesh, ProcessMesh), \ + "Argument process_mesh {} is not an instance of ProcessMesh".format(process_mesh) + else: + process_mesh = get_current_process_mesh() + assert process_mesh is not None, \ + "Specify the process mesh argument or use ProcessMesh context manager first." + assert isinstance(shard_spec, list), \ + "Argument shard_spec {} is not an instance of list".format(shard_spec) + dist_tensor = DistributedTensor(x) + serial_tensor = dist_tensor.serial_tensor + dist_tensor.dist_attr.process_mesh = process_mesh + if serial_tensor.type == core.VarDesc.VarType.READER \ + or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ + or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: + tensor_shape = [] + else: + tensor_shape = serial_tensor.shape + if shard_spec is not None: + assert verify_shard_spec(shard_spec, tensor_shape, process_mesh), \ + "For tensor {}, shard_spec {} is invalid with tensor_shape {} and process_mesh {}.".format( + serial_tensor.name, shard_spec, tensor_shape, process_mesh) + dist_tensor.dist_attr.dims_mapping = convert_to_dims_mapping( + shard_spec, process_mesh) + if process_mesh is not None: + dist_tensor.dist_attr.mark_annotated("process_mesh") + if shard_spec is not None: + dist_tensor.dist_attr.mark_annotated("dims_mapping") default_dist_ctx = get_default_distributed_context() default_dist_ctx.add_dist_tensor_for_program(dist_tensor) + dist_tensor = default_dist_ctx.get_dist_tensor_for_program(x) return x -def shard_op(op_fn, dist_attr=None): +def shard_op(op, process_mesh=None, in_shard_specs=None, out_shard_specs=None): """ - Call a functioin and add distributed attributes for ops added by the function. + Shard an operation on a process mesh according to its input and output shard specification. Args: - op_fn (callable): a callable operator or module to be sharded. - dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into - two categories. The first category decsribes the distributed attributes shared by all inputs and - outputs, and only `process_mesh` can be specified now. The second category describes distributed - attributes for inputs or outputs same as the `dist_attr` of `shard_tensor`. All of them are - optional and users can specify them as need. Note that `process_mesh` for operators must be the - same as these process_meshes for inputs and outputs. + op (Callable): a callable operator or module to be sharded. + process_mesh (ProcessMesh, optional): An instance of ProcessMesh describes a mesh + topology of the used logical processes where the op is sharded. All of its inputs and + outputs are sharded by this process mesh. If it is None, the found current process mesh + will be used. 
An error will be raised if the current process mesh cannot be found. + Default: None. + in_shard_specs (list of lists, optional): a list of lists to describe the sharding specifications + for the inputs. Each item of `in_shard_specs` is a `shard_spec` between the corresponding input + and `process_mesh`. If one item is None, the corresponding input is replicated across all processes. + If it is None, all inputs are replicated across all processes. Note that the length of the + `in_shard_specs` should be equal to the actual number of inputs when calling this operation. + Default: None. + out_shard_specs (list of lists, optional): a list of lists to describe the sharding specifications + for the outputs. Each item of `out_shard_specs` is a `shard_spec` between the corresponding output + and `process_mesh`. If one item is None, the corresponding output is replicated across all processes. + If it is None, all outputs are replicated across all processes. Note that the length of the + `out_shard_specs` should be equal to the actual number of outputs when calling this operation. + Default: None. Returns: - list: the outputs of the function `op_fn`, which are annotated with distributed attributes. + Outputs of `op`, each of which is annotated with sharding information. Examples: .. code-block:: python import paddle - import paddle.distributed as dist - - paddle.enable_static() - + import paddle.distributed.auto_parallel as auto + x = paddle.ones([4, 6]) y = paddle.zeros([4, 6]) - dist_add = dist.shard_op(paddle.add, - dist_attr={ - "process_mesh": [[2, 3, 1], [0, 4, 5]], - x: {"dims_mapping": [-1, 0]}, - y: {"dims_mapping": [0, -1]} - }) + mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) + dist_add = auto.shard_op(paddle.add, + mesh, + in_shard_specs=[["x", "y"], ["y", None]], + out_shard_specs=[[None, "x"]]) + dist_add(x, y) """ - _static_mode_check() - assert dist_attr is None or isinstance(dist_attr, (dict, OperatorDistributedAttribute)), \ - "The type of dist_attr must be dict or OperatorDistributedAttribute." - dist_module = DistributedModule(op_fn, dist_attr) - return dist_module + + if process_mesh is not None: + assert isinstance(process_mesh, ProcessMesh), \ + "Argument process_mesh {} is not an instance of ProcessMesh".format(process_mesh) + else: + process_mesh = get_current_process_mesh() + assert process_mesh is not None, \ + "Specify the process mesh argument or use ProcessMesh context manager first."
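`convert_to_dims_mapping`, imported from `.utils` and used below, is not shown in this diff; presumably it translates a `shard_spec` of mesh-axis names into the index-based `dims_mapping` of the old interface, where -1 marked an unsharded dimension. A sketch under that assumption:

.. code-block:: python

    def to_dims_mapping(shard_spec, mesh_dim_names):
        # Map each tensor dimension to the index of the mesh axis it is
        # sharded on; None (unsharded) becomes -1 as in the old dist_attr.
        return [
            -1 if name is None else mesh_dim_names.index(name)
            for name in shard_spec
        ]

    assert to_dims_mapping(["x", None], ["x", "y"]) == [0, -1]
    assert to_dims_mapping([None, "y"], ["x", "y"]) == [-1, 1]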
+ in_dims_mappings = [] + if in_shard_specs is not None: + assert all((isinstance(shard_spec, list) or shard_spec is None) for shard_spec in in_shard_specs), \ + "in_shard_spec {} is not a list of list or None".format(in_shard_specs) + for shard_spec in in_shard_specs: + if shard_spec is not None: + in_dims_mappings.append( + convert_to_dims_mapping(shard_spec, process_mesh)) + else: + in_dims_mappings.append(None) + out_dims_mappings = [] + if out_shard_specs is not None: + assert all((isinstance(shard_spec, list) or shard_spec is None) for shard_spec in out_shard_specs), \ + "out_shard_spec {} is not a list of list or None".format(out_shard_specs) + for shard_spec in out_shard_specs: + if shard_spec is not None: + out_dims_mappings.append( + convert_to_dims_mapping(shard_spec, process_mesh)) + else: + out_dims_mappings.append(None) + op = DistributedOperatorHelper(op, process_mesh, in_dims_mappings, + out_dims_mappings) + return op + + +def recompute(op): + + class RecomputeOperator: + + def __init__(self, op): + self._op = op + + def __call__(self, *args, **kwargs): + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + op_size = len(cur_block.ops) + output = self._op(*args, **kwargs) + new_op_size = len(cur_block.ops) + + for idx in range(op_size, new_op_size): + op = cur_block.ops[idx] + op._set_attr("is_recompute@auto_parallel", True) + + return output + + return RecomputeOperator(op) + + +_g_fetched_tensors = {} + + +def fetch(tensor, name=None): + if name is None: + _g_fetched_tensors[tensor.name] = tensor + else: + _g_fetched_tensors[name] = tensor + + +def _get_fetches(): + return _g_fetched_tensors diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 6e348b0c1df5e..63fc4660c7f04 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -42,6 +42,7 @@ from .utils import set_grad_var_shape from .utils import print_program_with_dist_attr from .utils import SerialProgramInfo +from .utils import get_logger from .reshard import Resharder from .cluster import Cluster from .mapper import mapping diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index f452f6a3cf977..b83a19b512ef8 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -22,7 +22,6 @@ from paddle.fluid.backward import append_backward from paddle.fluid.framework import _non_static_mode, unique_name from paddle.distributed.passes import new_pass -from paddle.distributed.utils.log_utils import get_logger from .reshard import Resharder from .partitioner import Partitioner @@ -31,6 +30,7 @@ from .dist_loader import NonIterableGeneratorLoader from .utils import make_data_unshard, set_grad_var_shape from .utils import print_program_with_dist_attr, to_list +from .utils import get_logger from .process_group import get_all_process_groups, get_world_process_group from .dist_context import DistributedContext, get_default_distributed_context @@ -160,8 +160,8 @@ def _apply_pre_optimization(self, main_program, startup_program, loss, # apply quantization pass # The pass can be applied when mode must be 'train' - if self._mode == 'train' and self._strategy.qat: - config = copy.deepcopy(self._strategy.qat_configs) + if self._mode == 'train' and self._strategy.qat.enable: + config = 
copy.deepcopy(self._strategy.qat.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads auto_parallel_quantization_pass = new_pass( @@ -176,8 +176,8 @@ def _apply_pre_optimization(self, main_program, startup_program, loss, # apply amp pass # FIXME we disenable amp for eval since it has a little bug with # eval program and which will be fixed in future - if self._mode == 'train' and self._strategy.amp: - config = copy.deepcopy(self._strategy.amp_configs) + if self._mode == 'train' and self._strategy.amp.enable: + config = copy.deepcopy(self._strategy.amp.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads config["loss"] = loss @@ -195,8 +195,8 @@ def _apply_pre_optimization(self, main_program, startup_program, loss, # apply recompute pass # recompute is then train-only optimization - if self._mode == "train" and self._strategy.recompute: - config = copy.deepcopy(self._strategy.recompute_configs) + if self._mode == "train" and self._strategy.recompute.enable: + config = copy.deepcopy(self._strategy.recompute.to_dict()) config["dist_context"] = self._dist_context config["no_grad_set"] = None config["loss"] = loss @@ -217,12 +217,12 @@ def _apply_post_optimization(self, main_program, startup_program, rank, config = {} config["dist_context"] = self._dist_context config["global_rank"] = rank - config["use_sharding"] = self._strategy.sharding + config["use_sharding"] = self._strategy.sharding.enable dp_pass = new_pass("auto_parallel_data_parallel_optimization", config) dp_pass.apply([main_program], [startup_program], self._pass_context) - if self._strategy.sharding: - config = copy.deepcopy(self._strategy.sharding_configs) + if self._strategy.sharding.enable: + config = copy.deepcopy(self._strategy.sharding.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads config["global_rank"] = rank @@ -234,7 +234,7 @@ def _apply_post_optimization(self, main_program, startup_program, rank, # GradClip is train-only optimization if self._mode == "train": - config = copy.deepcopy(self._strategy.sharding_configs) + config = copy.deepcopy(self._strategy.sharding.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads config["rank_id"] = rank @@ -244,8 +244,8 @@ def _apply_post_optimization(self, main_program, startup_program, rank, self._pass_context) # gradient_merge is then train-only optimization - if self._mode == "train" and self._strategy.gradient_merge: - config = copy.deepcopy(self._strategy.gradient_merge_configs) + if self._mode == "train" and self._strategy.gradient_merge.enable: + config = copy.deepcopy(self._strategy.gradient_merge.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads auto_parallel_gradient_merge_pass = new_pass( diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py index ab1d68bbf8ea0..14ce5ea75b10c 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh.py +++ b/python/paddle/distributed/auto_parallel/process_mesh.py @@ -12,86 +12,90 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy +import numpy as np import copy +import paddle +# Use to store the previous and current process mesh +_g_previous_process_mesh = None +_g_current_process_mesh = None -def _get_nested_list_shape(nested_list): - """ - Get the shape of a nested_list. 
- """ - result = [] - while isinstance(nested_list, list): - result.append(len(nested_list)) - nested_list = nested_list[0] - return result +def get_current_process_mesh(): + global _g_current_process_mesh + return _g_current_process_mesh -def _flatten_nested_list(nested_list): - """ - Get a list of all items in a nested_list. - Ref: https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists - """ - result = numpy.array(nested_list).flatten().tolist() - return result +def set_current_process_mesh(process_mesh): + global _g_previous_process_mesh + global _g_current_process_mesh + _g_previous_process_mesh = _g_current_process_mesh + _g_current_process_mesh = process_mesh -class ProcessMesh(object): - r""" - The class `Processmesh` describes the topology of logical processes. - A mesh is an N-dimensional array. The shape of the N-dimensional - array represents the topology of logical processes and every - element of the N-dimensional array represent a logical process. For - example, the 2-dimensional array [[2, 4, 5], [0, 1, 3]] - illustrates six logical processes organized as the topology [2, 3], - i.e., the shape of the 2-dimensional array. With the above topology, - there are two parallel groups, where the first parallel group has a - parallel degree of 2 and the second one has a parallel degree of 3. - And the first logical process is the one with id=2. - Args: - mesh (list): an N-dimensional array (nested list) describes the toplogy - of logical processes. The shape of the N-dimensional array - represents the topology of logical processes and every - element of the N-dimensional array represents a logical process. +def reset_current_process_mesh(): + global _g_previous_process_mesh + global _g_current_process_mesh + _g_current_process_mesh = _g_previous_process_mesh - Returns: - None - Raises: - ValueError: If `mesh` is not an instance of list. +class ProcessMesh(object): + """ + The `ProcessMesh` object describes the topology of the used processes. + Args: + mesh (list|numpy.array): an n-dimensional array that describes the topology + of the processes. + dim_names (list, optional): the i-th element of this list gives the name of the + i-th dimension of the mesh. + + Examples: ..
code-block:: python import paddle - import paddle.distributed as dist + import paddle.distributed.auto_parallel as auto - - paddle.enable_static() - - mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) - assert mesh.topology == [2, 3] - assert mesh.processes == [2, 4, 5, 0, 1, 3] + + mesh = auto.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) + assert mesh.shape == [2, 3] + assert mesh.process_ids == [2, 4, 5, 0, 1, 3] """ - def __init__(self, mesh): - if mesh is None or not isinstance(mesh, list): - raise ValueError('mesh must be an instance of list.') - - processes = _flatten_nested_list(mesh) - - assert all(isinstance(p, int) for p in processes), \ - ("All elements of mesh must be integer") - - assert min(processes) >= 0, ('All elements of mesh must be >= 0.') - - unique_processes = set(processes) - assert len(unique_processes) == len(processes), ( - 'All elements of mesh must be unique.') - - self._topology = _get_nested_list_shape(mesh) - self._processes = processes + def __init__(self, mesh=None, dim_names=None, shape=None, process_ids=None): + # Use shape and process_ids just for compatibility + # Users should not use these directly + if mesh is None: + assert shape is not None + assert process_ids is not None + mesh = np.array(process_ids).reshape(shape) + + if not isinstance(mesh, list) and \ + not isinstance(mesh, np.ndarray): + raise ValueError( + 'The mesh must be an instance of list or np.ndarray.') + if isinstance(mesh, list): + mesh = np.array(mesh) + + self._mesh = mesh + self._shape = list(self._mesh.shape) + self._process_ids = self._mesh.flatten().tolist() + + assert all(isinstance(p, int) for p in self._process_ids), \ + ("All elements of the mesh must be integers") + assert min( + self._process_ids) >= 0, ('All elements of the mesh must be >= 0.') + unique_process_ids = set(self._process_ids) + assert len(unique_process_ids) == len( + self._process_ids), ('All elements of the mesh must be unique.') + + if dim_names is not None: + assert len(dim_names) == len(self._shape), \ + ("The length of dim_names must be the same as the number of mesh dimensions.") + self._dim_names = copy.deepcopy(dim_names) + else: + self._dim_names = ["d" + str(i) for i in range(len(self._shape))] + unique_dim_names = set(self._dim_names) + assert len(unique_dim_names) == len(self._dim_names), ( + 'All dim_names {} must be unique.'.format(dim_names)) # Store all process meshes from .dist_context import get_default_distributed_context @@ -103,31 +107,117 @@ def __init__(self, mesh): pg0.add_ranks(self.processes) @property - def topology(self): - r""" - Get the topology of logical processes belonging to this ProcessMesh. - This is the shape of `mesh` used to initialized this ProcessMesh. + def shape(self): + """ + Get the shape of this ProcessMesh. """ - return self._topology + return self._shape @property - def processes(self): - r""" - Get a list of all processes belonging to this ProcessMesh. + def process_ids(self): + """ + Get the process ids belonging to this ProcessMesh. """ - return self._processes + return self._process_ids + + @property + def dim_names(self): + """ + Get the dimension names of this ProcessMesh. + """ + return self._dim_names @property def ndim(self): - r""" - Get the number of dimension of ProcessMesh. """ - return len(self._topology) + Get the number of dimensions of this ProcessMesh. + """ + return len(self._shape) + + @property + def mesh(self): + """ + Get the underlying mesh of ProcessMesh.
+ """ + return self._mesh + + @property + def topology(self): + return self._shape + + @property + def processes(self): + return self._process_ids + + def __getitem__(self, index): + if isinstance(index, tuple): + new_dim_names = [] + for i, item in enumerate(index): + if isinstance(item, slice): + new_dim_names.append(self._dim_names[i]) + new_mesh = self._mesh[index] + if new_mesh.shape: + return ProcessMesh(new_mesh, new_dim_names) + else: + # Wrap a scalar into a list but without dim_names + return ProcessMesh([new_mesh]) + elif isinstance(index, slice): + new_mesh = self._mesh[index] + new_dim_names = self._dim_names + return ProcessMesh(new_mesh, new_dim_names) + else: + new_mesh = self._mesh[index] + new_dim_names = self._dim_names[1:] + return ProcessMesh(new_mesh, new_dim_names) + + def __enter__(self): + set_current_process_mesh(self) + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + self._old_var_names = list(cur_block.vars.keys()) + self._old_op_size = len(cur_block.ops) + + def __exit__(self, exc_type, exc_value, exc_traceback): + from .dist_tensor import DistributedTensor + from .dist_op import DistributedOperator + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + new_var_names = list(cur_block.vars.keys()) + new_op_size = len(cur_block.ops) + from .dist_context import get_default_distributed_context + default_dist_ctx = get_default_distributed_context() + for name in new_var_names: + if name not in self._old_var_names: + tensor = cur_block.vars[name] + dist_tensor = default_dist_ctx.get_dist_tensor_for_program( + tensor) + if dist_tensor is None: + dist_tensor = DistributedTensor(cur_block.vars[name], + {"process_mesh": self}) + dist_tensor.dist_attr.mark_annotated("process_mesh") + default_dist_ctx.add_dist_tensor_for_program(dist_tensor) + else: + if dist_tensor.dist_attr.process_mesh is None: + dist_tensor.dist_attr.process_mesh = self + dist_tensor.dist_attr.mark_annotated("process_mesh") + + for idx in range(self._old_op_size, new_op_size): + op = cur_block.ops[idx] + dist_op = default_dist_ctx.get_dist_op_for_program(op) + if dist_op is None: + dist_op = DistributedOperator(op, {"process_mesh": self}) + dist_op.dist_attr.mark_annotated("process_mesh") + default_dist_ctx.add_dist_op_for_program(dist_op) + else: + if dist_op.dist_attr.process_mesh is None: + dist_op.dist_attr.process_mesh = self + dist_op.dist_attr.mark_annotated("process_mesh") + reset_current_process_mesh() def __eq__(self, other): if not isinstance(other, ProcessMesh): return False - if self.topology != other.topology or self.processes != other.processes: + if self.shape != other.shape or self.process_ids != other.process_ids: return False return True @@ -135,6 +225,6 @@ def __ne__(self, other): return not self.__eq__(other) def __str__(self): - str = "shape {} and process group {}".format(self.topology, - self.processes) + str = "shape {}, process_ids {}, dim_names {}".format( + self.shape, self.process_ids, self.dim_names) return str diff --git a/python/paddle/distributed/auto_parallel/process_mesh_v2.py b/python/paddle/distributed/auto_parallel/process_mesh_v2.py index b57cecf41e262..aa9401b5f50e8 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh_v2.py +++ b/python/paddle/distributed/auto_parallel/process_mesh_v2.py @@ -81,54 +81,57 @@ def mesh(self): return self._mesh -# def compute_compatible_process_meshes(process_meshes): -# """Compute the compatible process mesh given a list of process
meshes.""" -# if not process_meshes: -# return None - -# def _compute_compatible_two_process_meshes(pm1, pm2): -# if pm1 is None: -# return True, pm2 -# if pm2 is None: -# return True, pm1 -# if pm1 == pm2: -# return True, pm1 -# if pm1.device_mesh != pm2.device_mesh: -# return False, None -# if pm1.process_ids == pm2.process_ids: -# if len(pm1.shape) >= len(pm2.shape): -# return True, pm1 -# else: -# return True, pm2 -# process_set1 = set(pm1.process_ids) -# process_set2 = set(pm2.process_ids) -# if process_set1.issubset(process_set2): -# return True, pm2 -# if process_set2.issubset(process_set1): -# return True, pm1 -# return False, None - -# compatible_result = None -# for process_mesh in process_meshes: -# compatible, compatible_result = _compute_compatible_two_process_meshes( -# compatible_result, process_mesh) -# if not compatible: -# return None -# return ProcessMesh(compatible_result.mesh, compatible_result.dim_names) - -# def merge_process_meshes(process_meshes): -# """Merge a list of process meshes.""" -# merged_process_mesh = None -# merged_process_ids = set() -# device_type = "" -# for process_mesh in process_meshes: -# if process_mesh is not None: -# process_ids = set(process_mesh.process_ids) -# if not device_type: -# device_type = process_mesh.device_type -# assert device_type != process_mesh.device_type, \ -# "All process meshes must have the same device_type." -# merged_process_ids.union(process_ids) -# if len(merged_process_ids) != 0: -# merged_process_mesh = ProcessMesh(list(merged_process_ids)) -# return merged_process_mesh +def compute_compatible_process_mesh(process_meshes): + """Compute the compatible process mesh given a list of process meshes.""" + if not process_meshes: + return None + + def _compute_compatible_of_two_process_meshes(pm1, pm2): + if pm1 is None: + return True, pm2 + if pm2 is None: + return True, pm1 + if pm1 == pm2: + return True, pm1 + if pm1.process_ids == pm2.process_ids: + if len(pm1.shape) >= len(pm2.shape): + return True, pm1 + else: + return True, pm2 + process_set1 = set(pm1.process_ids) + process_set2 = set(pm2.process_ids) + if process_set1.issubset(process_set2): + return True, pm2 + if process_set2.issubset(process_set1): + return True, pm1 + return False, None + + compatible_result = None + for process_mesh in process_meshes: + compatible, compatible_result = _compute_compatible_of_two_process_meshes( + compatible_result, process_mesh) + if not compatible: + return None + if compatible_result.empty(): + return None + if isinstance(compatible_result, core.ProcessMesh): + mesh = np.array(compatible_result.process_ids).reshape( + compatible_result.shape) + return ProcessMesh(mesh, compatible_result.dim_names) + elif isinstance(compatible_result, ProcessMesh): + return ProcessMesh(compatible_result.mesh, compatible_result.dim_names) + else: + raise ValueError("Unrecognized ProcessMesh.") + + +def merge_process_mesh(process_meshes): + """Merge a list of process meshes.""" + merged_process_mesh = None + merged_process_ids = set() + for process_mesh in process_meshes: + if process_mesh is not None: + process_ids = set(process_mesh.process_ids) + merged_process_ids = merged_process_ids.union(process_ids) + if len(merged_process_ids) != 0: + merged_process_mesh = ProcessMesh(list(merged_process_ids)) + return merged_process_mesh diff --git a/python/paddle/distributed/auto_parallel/strategy.py b/python/paddle/distributed/auto_parallel/strategy.py new file mode 100644 index 0000000000000..c196b321eafd0 --- /dev/null +++ 
b/python/paddle/distributed/auto_parallel/strategy.py @@ -0,0 +1,181 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import os +import copy +import argparse +import yaml +from . import constants + + +class BaseConfig(object): + + def __init__(self, category, config_dict=None): + self._category = category + self._config_dict = None + if config_dict is not None: + if isinstance(config_dict, dict): + self._config_dict = config_dict + else: + raise ValueError( + "Expected a dictionary. But received: {}".format( + config_dict)) + # Initialize attributes by the default config + config = constants.get_category_default_config(self._category) + for field, default_value in config.items(): + setattr(self, field, default_value) + + # Override attributes by the config_dict + if self._config_dict: + self.from_dict(self._config_dict) + + def from_dict(self, config_dict): + config = constants.get_category_default_config(self._category) + for field in config.keys(): + value = config_dict.get(field, constants.NOT_FOUND) + # Use the default value if we cannot find the value + if value != constants.NOT_FOUND: + setattr(self, field, value) + + def to_dict(self): + result_dict = {} + config = constants.get_category_default_config(self._category) + for field in config.keys(): + value = getattr(self, field) + result_dict[field] = value + for field, value in self.__dict__.items(): + if isinstance(value, BaseConfig): + result_dict[field] = value.to_dict() + return result_dict + + def __repr__(self): + return yaml.dump(self.to_dict(), + default_flow_style=False, + sort_keys=True, + indent=4) + + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + setattr(result, k, copy.deepcopy(v, memo)) + return result + + +class RecomputeConfig(BaseConfig): + + def __init__(self, config_dict=None): + category = constants.RECOMPUTE + super(RecomputeConfig, self).__init__(category, config_dict) + + +class AMPConfig(BaseConfig): + + def __init__(self, config_dict=None): + category = constants.AMP + super(AMPConfig, self).__init__(category, config_dict) + + +class ShardingConfig(BaseConfig): + + def __init__(self, config_dict=None): + category = constants.SHARDING + super(ShardingConfig, self).__init__(category, config_dict) + + +class GradientMergeConfig(BaseConfig): + + def __init__(self, config_dict=None): + category = constants.GRADIENT_MERGE + super(GradientMergeConfig, self).__init__(category, config_dict) + + +class QATConfig(BaseConfig): + + def __init__(self, config_dict=None): + category = constants.QAT + super(QATConfig, self).__init__(category, config_dict) + + +class TuningConfig(BaseConfig): + + def __init__(self, config_dict=None): + category = constants.TUNING + super(TuningConfig, self).__init__(category, config_dict) + + +class Strategy(BaseConfig): + """ + The `Strategy` object is used to configure the parallelization and optimization behaviors.
+ + Args: + config (dict|string, optional): If this is None, the default configurations will be used. + If this is a dictionary, the recognized key-value pairs in it will be used to override the default + configurations while other default configurations are left unchanged. If this is a string, + it is interpreted as the path to a YAML configuration and will be loaded to override the + corresponding default configurations. + + Examples: + .. code-block:: python + + import paddle + import paddle.distributed.auto_parallel as auto + + strategy = auto.Strategy() + sharding = strategy.sharding + assert sharding.enable == False + assert sharding.stage == 1 + assert sharding.sharding_degree == 8 + sharding.enable = True + sharding.stage = 2 + sharding.sharding_degree = 2 + assert sharding.enable == True + assert sharding.stage == 2 + assert sharding.sharding_degree == 2 + + """ + + def __init__(self, config=None): + if config is not None: + if isinstance(config, dict): + self._config_dict = copy.deepcopy(config) + # elif os.path.exists(config): + # with open(config, "rb") as yaml_file: + # self._config_dict = yaml.load(yaml_file, Loader=yaml.Loader) + else: + raise ValueError( + "Expected a dictionary. But received: {}".format(config)) + else: + self._config_dict = {} + + category = constants.BASE + super(Strategy, self).__init__(category, self._config_dict) + + config_dict = self._config_dict.get(constants.RECOMPUTE, None) + self.recompute = RecomputeConfig(config_dict) + + config_dict = self._config_dict.get(constants.AMP, None) + self.amp = AMPConfig(config_dict) + + config_dict = self._config_dict.get(constants.SHARDING, None) + self.sharding = ShardingConfig(config_dict) + + config_dict = self._config_dict.get(constants.GRADIENT_MERGE, None) + self.gradient_merge = GradientMergeConfig(config_dict) + + config_dict = self._config_dict.get(constants.QAT, None) + self.qat = QATConfig(config_dict) + + config_dict = self._config_dict.get(constants.TUNING, None) + self.tuning = TuningConfig(config_dict) diff --git a/python/paddle/distributed/auto_parallel/tuner/algorithms.py b/python/paddle/distributed/auto_parallel/tuner/algorithms.py index c733b6663dee6..f892a7838fe7a 100644 --- a/python/paddle/distributed/auto_parallel/tuner/algorithms.py +++ b/python/paddle/distributed/auto_parallel/tuner/algorithms.py @@ -16,7 +16,7 @@ from abc import ABC, abstractmethod import logging -from paddle.distributed.utils.log_utils import get_logger +from ..utils import get_logger from .trial import TrialStatus from .trial import OptimizationTunerTrial as Trial @@ -110,13 +110,13 @@ class ShardingStageAlgorithm(AlgorithmBase): # TODO import trial class & copy strategy def __init__(self, config): super().__init__(config) - self._changed_configs = ["sharding_configs"] + self._changed_configs = ["sharding"] def _init_spaces(self): self._max_stage = 3 self._trial_idx = 0 - stage_range = self._config.sharding_configs.get("stage_range", None) + stage_range = self._config.sharding.to_dict().get("tuning_range", None) if stage_range: assert set(stage_range).issubset( set([0, 1, 2, 3]) @@ -136,9 +136,8 @@ def next_trial(self): stage = self._stage_range[self._trial_idx] new_strategy = copy.deepcopy(self._config.dist_strategy) - config_dict = new_strategy.sharding_configs - config_dict["stage"] = stage - new_strategy.sharding_configs = config_dict + sharding = new_strategy.sharding + sharding.stage = stage name = "trial-sharding-stage{}".format(stage) trial = Trial(new_strategy,
name, self.changed_configs) diff --git a/python/paddle/distributed/auto_parallel/tuner/config.py b/python/paddle/distributed/auto_parallel/tuner/config.py index 151a9a8bc76aa..3083298eff87d 100644 --- a/python/paddle/distributed/auto_parallel/tuner/config.py +++ b/python/paddle/distributed/auto_parallel/tuner/config.py @@ -17,15 +17,13 @@ import pathlib import paddle -from paddle.distributed import fleet +from ..strategy import Strategy _tuning_supported_passes = ["sharding", "recompute"] -_strategy_config_suffiex = "_configs" def _get_pass_config(strategy, pass_name): - config_name = pass_name + _strategy_config_suffiex - config = getattr(strategy, config_name) + config = getattr(strategy, pass_name) return config @@ -38,10 +36,8 @@ class TuningConfig(object): def __init__(self, user_config, strategy): - if not isinstance(strategy, fleet.DistributedStrategy): - raise TypeError( - "'strategy' must be object of class `fleet.DistributedStrategy`." - ) + if not isinstance(strategy, Strategy): + raise TypeError("'strategy' must be object of class `Strategy`.") if not user_config: user_config = {} @@ -116,11 +112,11 @@ def _initialize(self, user_config): for p in _tuning_supported_passes: if getattr(self._dist_strategy, p) and _get_pass_config( - self._dist_strategy, p)["enable_tuning"]: + self._dist_strategy, p).enable_tuning: # TODO distinguish different args of each passes self._tuning_passes_name.add(p) - config_name = p + _strategy_config_suffiex + config_name = p p_dict = getattr(self._dist_strategy, config_name) self.__dict__[config_name] = p_dict diff --git a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py index 7bd1c1b873e97..a2da7396ce83c 100644 --- a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
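With the change above, the tuner reads pass configs as attributes of the new `Strategy` object instead of `<pass>_configs` dicts on `fleet.DistributedStrategy`. A minimal sketch of the lookup pattern this enables, not the actual tuner code; the `enable_tuning` field is assumed to exist in the category defaults defined in `constants`:

    import paddle.distributed.auto_parallel as auto

    strategy = auto.Strategy()
    for pass_name in ["sharding", "recompute"]:
        # each pass config is now a BaseConfig subclass reachable by name
        pass_config = getattr(strategy, pass_name)
        # to_dict() flattens the config back into the plain dict the passes expect
        if getattr(pass_config, "enable_tuning", False):
            print(pass_name, pass_config.to_dict())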
+# import yaml import os import sys import copy @@ -29,7 +30,6 @@ from paddle.fluid import program_guard from paddle.fluid.backward import append_backward from paddle.distributed.passes import new_pass, PassContext -from paddle.distributed.utils.log_utils import get_logger from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context from paddle.distributed.auto_parallel.completion import Completer @@ -39,6 +39,7 @@ from paddle.distributed.auto_parallel.utils import debug_program from paddle.distributed.auto_parallel.utils import make_data_unshard, set_grad_var_shape +from ..utils import get_logger from .config import TuningConfig from .algorithms import new_algorithm from .trial import TrialStatus @@ -256,8 +257,8 @@ def _apply_optimization(self, trial): startup_program = dist_context.serial_startup_program # applying optimization pass - if new_strategy.amp: - config = copy.deepcopy(new_strategy.amp_configs) + if new_strategy.amp.enable: + config = copy.deepcopy(new_strategy.amp.to_dict()) config["dist_context"] = dist_context config["params_grads"] = dist_context._params_grads @@ -275,8 +276,8 @@ def _apply_optimization(self, trial): auto_parallel_amp_pass.apply([main_program], [startup_program], pass_context) - if new_strategy.recompute: - config = copy.deepcopy(new_strategy.recompute_configs) + if new_strategy.recompute.enable: + config = copy.deepcopy(new_strategy.recompute.to_dict()) config["dist_context"] = dist_context config["no_grad_set"] = None config["loss"] = dist_context.serial_loss @@ -303,8 +304,8 @@ def _apply_optimization(self, trial): dist_context, dist_params_grads) resharder.reshard() - if new_strategy.sharding: - config = copy.deepcopy(new_strategy.sharding_configs) + if new_strategy.sharding.enable: + config = copy.deepcopy(new_strategy.sharding.to_dict()) config["dist_context"] = dist_context config["params_grads"] = dist_params_grads config["global_rank"] = self.rank @@ -313,8 +314,8 @@ def _apply_optimization(self, trial): auto_parallel_sharding_pass.apply([dist_main_prog], [dist_startup_prog], pass_context) - if new_strategy.gradient_merge: - config = copy.deepcopy(new_strategy.gradient_merge_configs) + if new_strategy.gradient_merge.enable: + config = copy.deepcopy(new_strategy.gradient_merge.to_dict()) config["dist_context"] = dist_context config["params_grads"] = dist_params_grads auto_parallel_gradient_merge_pass = new_pass( @@ -492,9 +493,10 @@ def summary(self): for line in summary_.split("\n"): fw.write(line + "\n") - full_strategy = self.get_best_config() - full_strategy.save_to_prototxt( - os.path.join(self.project_dir, "tuned_dist_strategy.prototxt")) + # full_strategy = self.get_best_config() + # path = os.path.join(self.project_dir, "tuned_dist_strategy.yaml") + # with open(path, 'w') as outfile: + # yaml.dump(full_strategy, outfile, default_flow_style=False) def clear(self): """ diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py index 3937ca9865181..edc588b4c70fe 100644 --- a/python/paddle/distributed/auto_parallel/tuner/trial.py +++ b/python/paddle/distributed/auto_parallel/tuner/trial.py @@ -156,9 +156,10 @@ def summary(self): draws += h1_format.format("{} auto=True <-> {}".format(name, name)) draws += line + "\n" my_configs = getattr(self.space, name) - keys = my_configs.keys() + keys = my_configs.to_dict().keys() for key in keys: - draws += h2_format.format(key, str(my_configs.get(key, None))) + draws += h2_format.format( + 
key, str(my_configs.to_dict().get(key, None))) result_res = draws + border return result_res diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index bc797530b7535..ef165f5ff086d 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -28,6 +28,19 @@ from paddle.distributed.auto_parallel.dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute +def get_logger(log_level, name="auto_parallel"): + logger = logging.getLogger(name) + logger.propagate = False + if not logger.handlers: + logger.setLevel(log_level) + log_handler = logging.StreamHandler() + log_format = logging.Formatter( + '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s') + log_handler.setFormatter(log_format) + logger.addHandler(log_handler) + return logger + + def is_valid_list_index(list, index): if index >= -len(list) and index < len(list): return True @@ -49,6 +62,58 @@ def is_dim_replicate(mapping): return False +def verify_dims_mapping(dims_mapping, process_mesh): + if dims_mapping is None: + return False + if not all(isinstance(d, int) for d in dims_mapping): + return False + for i in range(len(dims_mapping)): + if dims_mapping[i] < -1 or dims_mapping[i] >= len(process_mesh.shape): + return False + for i in range(len(process_mesh.shape)): + if dims_mapping.count(i) > 1: + return False + return True + + +def convert_to_dims_mapping(shard_spec, process_mesh): + dims_mapping = [] + for shard in shard_spec: + if shard is None: + dims_mapping.append(-1) + else: + dims_mapping.append(process_mesh.dim_names.index(shard)) + return dims_mapping + + +def convert_to_shard_spec(dims_mapping, process_mesh): + shard_spec = [] + for dim_mapping in dims_mapping: + if dim_mapping == -1: + shard_spec.append(None) + else: + shard_spec.append(process_mesh.dim_names[dim_mapping]) + return shard_spec + + +def verify_shard_spec(shard_spec, tensor_shape, process_mesh): + if len(shard_spec) != len(tensor_shape): + return False + for shard in shard_spec: + if shard is not None and not isinstance(shard, str): + return False + if shard is not None and shard not in process_mesh.dim_names: + return False + dims_mapping = convert_to_dims_mapping(shard_spec, process_mesh) + if not verify_dims_mapping(dims_mapping, process_mesh): + return False + for i in range(len(tensor_shape)): + if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ + and tensor_shape[i] % process_mesh.shape[dims_mapping[i]] != 0: + return False + return True + + def compute_compatible_dim_mapping(dim_mappings): if not dim_mappings: return None @@ -1040,7 +1105,7 @@ def set_grad_var_shape(program, dist_context): if op.type in [ "c_allreduce_sum", "c_identity", "scale", "cast", - 'fill_any_like' + "fill_any_like" ]: forward_var_name = op.input_arg_names[0] elif op.type == "matmul_v2_grad" or op.type == "matmul_grad" or op.type == "mul_grad": diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 57731b8ad0ed8..5ceb046f55072 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -40,46 +40,23 @@ from paddle import _C_ops, _legacy_C_ops import paddle.fluid.dygraph_utils as dygraph_utils import contextlib +from .fleet.layers.mpu.mp_ops import split +from .fleet.layers.mpu.mp_ops import _c_identity +from .fleet.layers.mpu.mp_ops import _c_concat +from .fleet.layers.mpu.mp_ops import _c_split +from .fleet.layers.mpu.mp_ops import 
_mp_allreduce +from .fleet.layers.mpu.mp_ops import _c_lookup_table +from .fleet.layers.mpu.mp_ops import _Linear +from .fleet.layers.mpu.mp_ops import _set_var_distributed +from .fleet.layers.mpu.mp_ops import _c_softmax_with_cross_entropy +from .fleet.layers.mpu.mp_ops import _linear +from .fleet.layers.mpu.mp_ops import _parallel_linear +from .fleet.layers.mpu.mp_ops import _parallel_embedding +from .communication.comm_utils import ReduceOp __all__ = [] -class ReduceOp: - """ - Specify the type of operation used for element-wise reductions. - It should be one of the following values: - - ReduceOp.SUM - - ReduceOp.MAX - - ReduceOp.MIN - - ReduceOp.PROD - - Examples: - .. code-block:: python - - # required: distributed - import paddle - import paddle.distributed as dist - - dist.init_parallel_env() - if dist.get_rank() == 0: - data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) - else: - data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) - dist.all_reduce(data, op=dist.ReduceOp.SUM) - print(data) - # [[5, 7, 9], [5, 7, 9]] (2 GPUs) - """ - SUM = 0 - MAX = 1 - MIN = 2 - PROD = 3 - AVG = 4 - - class Group(): """ The abstract representation of group. @@ -1259,747 +1236,6 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): }) -def _c_identity(tensor, group=None): - """ - Return a copy of the tensor, mainly used with model parallel. - - Args: - tensor (Tensor): The input Tensor. Its data type - should be float16, float32, float64, int32 or int64. - group (int): The id of the process group to work on. - - Returns: - Tensor. - """ - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - - if _non_static_mode(): - return _legacy_C_ops.c_identity(tensor, 'use_calc_stream', True, - 'ring_id', ring_id, - 'use_model_parallel', True) - op_type = 'c_identity' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - - check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_identity') - - helper.append_op(type=op_type, - inputs={'X': tensor}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'use_model_parallel': True, - }) - return out - - -def _c_concat(tensor, group=None): - """ - Return allgather of the tensor, mainly used with model parallel. - - Args: - tensor (Tensor): The input Tensor. Its data type - should be float16, float32, float64, int32 or int64. - group (int): The id of the process group to work on. - - Returns: - Tensor. 
- """ - if group is not None and not group.is_member(): - return - group = _get_default_group() if group is None else group - ring_id = group.id - - global_rank = _get_global_env().rank - rank = group.rank - nranks = group.nranks - - if _non_static_mode(): - return _legacy_C_ops.c_concat(tensor, 'ring_id', ring_id, - 'use_calc_stream', True, 'rank', rank, - 'nranks', nranks, 'use_model_parallel', - True) - - op_type = 'c_concat' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - - check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_concat') - - helper.append_op(type=op_type, - inputs={'X': tensor}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'use_model_parallel': True, - 'nranks': nranks, - 'rank': rank - }) - return out - - -def _c_split(tensor, group=None): - """ - Split tensor evenly among all members, mainly used with model parallel. - - Args: - tensor (Tensor): The input Tensor. Its data type - should be float16, float32, float64, int32 or int64. - rank (int): The rank of the current process. - group (int): The id of the process group to work on. - - Returns: - Tensor. - """ - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - - global_rank = _get_global_env().rank - rank = global_rank if group is None else group.get_group_rank(global_rank) - nranks = _get_global_env().world_size if group is None else group.nranks - - if _non_static_mode(): - return _legacy_C_ops.c_split(tensor, 'use_calc_stream', True, 'ring_id', - ring_id, 'rank', rank, 'nranks', nranks, - 'use_model_parallel', True) - - op_type = 'c_split' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - - check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_split') - - helper.append_op(type=op_type, - inputs={'X': tensor}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'rank': rank, - 'nranks': nranks, - 'use_model_parallel': True, - }) - return out - - -def _mp_allreduce(tensor, - op=ReduceOp.SUM, - group=None, - use_calc_stream=True, - use_model_parallel=True): - """[it is same as allreduce above, but it supports model parallel. 
And it support inplace startegy] - """ - if group is not None and not group.is_member(): - return - - if in_dygraph_mode(): - group = _get_default_group() if group is None else group - assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op) - - from paddle.autograd import PyLayer - - class mp_allreduce_eager(PyLayer): - - @staticmethod - def forward(ctx, tensor, group, use_calc_stream, - use_model_parallel): - ctx.ring_id = group.id - - if use_calc_stream: - op_type = _get_reduce_op(op, "_mp_allreduce") - group.process_group.allreduce_on_calc_stream( - tensor, op_type) - return tensor - else: - return _legacy_C_ops.c_allreduce_sum_( - tensor, 'use_calc_stream', use_calc_stream, 'ring_id', - ring_id, "use_model_parallel", use_model_parallel) - - @staticmethod - def backward(ctx, dy): - return _legacy_C_ops.c_identity(dy, 'use_calc_stream', True, - 'ring_id', ctx.ring_id, - 'use_model_parallel', True) - - return mp_allreduce_eager.apply(tensor, group, use_calc_stream, - use_model_parallel) - - ring_id = 0 if group is None else group.id - if _in_legacy_dygraph(): - if op == ReduceOp.SUM: - return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id, "use_model_parallel", - use_model_parallel) - else: - raise ValueError("Unknown parameter: {}.".format(op)) - - op_type = 'c_allreduce_sum' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - - check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - op_type) - - helper.append_op(type=op_type, - inputs={'X': tensor}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream, - 'use_model_parallel': use_model_parallel, - }) - return out - - -def _c_lookup_table(table, index, start_index=0, name=None): - """ - Lookup table according to index. - - Args: - table (Tensor): The input Tensor. Its data type - should be float16, float32, float64. - index (Tensor): The index to lookup table. - start_index (int): The initial index for table range. - name (string): The name of the api - - Returns: - Tensor. 
- """ - if _non_static_mode(): - return _legacy_C_ops.c_embedding(table, index, "start_index", - start_index) - - op_type = 'c_embedding' - helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='table') - check_variable_and_dtype(index, 'input', ['int32', 'int64'], op_type) - tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='c_embedding', - inputs={ - 'Ids': index, - 'W': table - }, - outputs={'Out': tmp}, - attrs={"start_index": start_index}) - return tmp - - -class _Linear(layers.Layer): - """ - Linear - """ - - def __init__(self, - in_features, - out_features, - weight_attr=None, - bias_attr=None, - name=None): - super(_Linear, self).__init__() - self._dtype = self._helper.get_default_dtype() - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self.weight = self.create_parameter(shape=[in_features, out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - self.bias = self.create_parameter(shape=[out_features], - attr=self._bias_attr, - dtype=self._dtype, - is_bias=True) - self.name = name - - def forward(self, input): - out = _linear(x=input, - weight=self.weight, - bias=self.bias, - name=self.name) - return out - - def extra_repr(self): - name_str = ', name={}'.format(self.name) if self.name else '' - return 'in_features={}, out_features={}, dtype={}{}'.format( - self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) - - -def _c_softmax_with_cross_entropy(logits, - label, - group=None, - return_softmax=False): - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - global_rank = _get_global_env().rank - rank = global_rank if group is None else group.get_group_rank(global_rank) - nranks = _get_global_env().world_size if group is None else group.nranks - - input_dims = len(list(logits.shape)) - label_dims = len(list(label.shape)) - if input_dims - 1 != label_dims and input_dims != label_dims: - raise ValueError( - 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ - (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) - if input_dims - 1 == label_dims: - label = paddle.unsqueeze(label, axis=-1) - - if _non_static_mode(): - softmax, loss = _legacy_C_ops.c_softmax_with_cross_entropy( - logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks) - if not return_softmax: - return loss - else: - return loss, softmax - - attrs = { - 'ring_id': ring_id, - 'rank': rank, - 'nranks': nranks, - } - helper = LayerHelper('c_softmax_with_cross_entropy', **locals()) - softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) - loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - helper.append_op(type='c_softmax_with_cross_entropy', - inputs={ - 'Logits': logits, - 'Label': label - }, - outputs={ - 'Softmax': softmax, - 'Loss': loss - }, - attrs=attrs) - - if return_softmax: - return loss, softmax - - return loss - - -def _linear(x, weight, bias=None, name=None): - """ - Fuction Linear - """ - if _non_static_mode(): - pre_bias = _varbase_creator(dtype=x.dtype) - _legacy_C_ops.matmul(x, weight, pre_bias, 'transpose_X', False, - 'transpose_Y', False, "alpha", 1) - return dygraph_utils._append_bias_in_dygraph(pre_bias, - bias, - axis=len(x.shape) - 1) - else: - helper = LayerHelper('linear', **locals()) - dtype = x.dtype - assert len( - x.shape) < 4, "X latitude is not supported greater than 3 now." 
- - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'linear') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') - - inputs = {'X': [x], 'Y': [weight]} - attrs = { - 'transpose_X': False, - 'transpose_Y': False, - 'alpha': 1, - } - tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='matmul_v2', - inputs=inputs, - outputs={'Out': tmp}, - attrs=attrs) - if bias is not None: - res = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='elementwise_add', - inputs={ - 'X': [tmp], - 'Y': [bias] - }, - outputs={'Out': [res]}, - attrs={'axis': len(x.shape) - 1}) - else: - res = tmp - return res - - -def _set_var_distributed(var): - if var is None: - return - - var.is_distributed = True - - # NOTE: use current_block and find_var_recursive to support while_loop - startup_block = paddle.static.default_startup_program().current_block() - main_block = paddle.static.default_main_program().current_block() - startup_block._find_var_recursive(var.name).is_distributed = True - main_block._find_var_recursive(var.name).is_distributed = True - - -def _parallel_linear(x, - num_rows, - num_cols, - axis, - param_attr, - bias_attr, - gather_out, - inner_rank, - nranks, - split_tensor, - name, - group=None): - """ - Parallel Linear - - axis the dimension of the parameter of linear layer. - axis = 0: the row dimension - axis = 1: the col dimension - - """ - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - - if axis == 0: - if split_tensor: - x = _c_split(x, group=group) - else: - x = _c_identity(x, group=group) - - linear = paddle.nn.Linear(num_rows, - num_cols, - weight_attr=param_attr, - bias_attr=bias_attr, - name=name) - - # NOTE: npu linear function use matmul_v2 but linear use matmul - linear_function = _linear if core.is_compiled_with_npu()\ - else paddle.nn.functional.linear - linear_out = linear_function( - x, - linear.weight, - # NOTE(wangxi): row split, bias need add after allreduce - None if axis == 0 else linear.bias, - linear.name) - - _set_var_distributed(linear.weight) - # set is_distributed for splited bias - # if a linear layer is splited by row, each rank would hold a complete bias and they should be the same in each rank. 
- # if a linear layer is splited by col, the bias would also be split into each rank as its weight - if axis == 1 and linear._bias_attr != False: - _set_var_distributed(linear.bias) - - if not gather_out: return linear_out - - out_shape = list(linear_out.shape) - out_shape[0] *= 1 if axis == 0 else nranks - main_block = paddle.static.default_main_program().current_block() - out = main_block.create_var( - shape=out_shape, - dtype=linear_out.dtype, - type=linear_out.type, - lod_level=linear_out.lod_level, - persistable=False, - is_data=False, - need_check_feed=linear_out.desc.need_check_feed()) - if axis == 0: - main_block.append_op(type='c_allreduce_sum', - inputs={'X': linear_out}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'use_model_parallel': True - }) - if linear.bias is not None: - out = out + linear.bias - else: - main_block.append_op(type='c_concat', - inputs={'X': linear_out}, - outputs={'Out': out}, - attrs={ - 'rank': inner_rank, - 'ring_id': ring_id, - 'nranks': nranks, - 'use_calc_stream': True, - 'use_model_parallel': True - }) - return out - - -def _parallel_embedding(x, - per_part_embeddings, - origin_size, - param_attr, - inner_rank, - num_partitions, - name, - group=None): - """ - Parallel Embedding - """ - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - - helper = LayerHelper("_parallel_embedding", **locals()) - - per_part_size = per_part_embeddings - rank = inner_rank - - vocab_start_index = rank * per_part_size - dtype = helper.get_default_dtype() - size = [per_part_size, origin_size[1]] - - weight = helper.create_parameter(attr=param_attr, - shape=size, - dtype=dtype, - is_bias=False) - - if num_partitions == 1: - return paddle.nn.functional.embedding(x, - weight=weight, - padding_idx=None, - sparse=False, - name=name) - - startup_block = paddle.static.default_startup_program().global_block() - main_block = paddle.static.default_main_program().global_block() - startup_block.vars[weight.name].is_distributed = True - main_block.vars[weight.name].is_distributed = True - - output_parallel = paddle.distributed.collective._c_lookup_table( - weight, x, start_index=vocab_start_index, name=name) - out = paddle.distributed.collective._mp_allreduce(output_parallel, - group=group, - use_calc_stream=True, - use_model_parallel=True) - return out - - -def split(x, - size, - operation, - axis=0, - num_partitions=1, - gather_out=True, - weight_attr=None, - bias_attr=None, - name=None): - """ - - Split the weight of the specified operation into multiple devices - and do the computation in parallel. - - Now the following three cases are supported. - - Case 1: Parallel Embedding - The weight of the embedding operation is a NxM matrix with N rows and M columns. - With parallel embedding, the weight is split into num_partitions partitions, each - of which is a matrix with (N/num_partitions + 1) rows and M column where the last - row as the padding idx. - - Suppose we split the NxM weight into two partitons on device_0 and device_1 - respectively. Then, one each device, the final weight has (N/2 + 1) rows with the - index range from 0 to N/2. On device_0, all values in the input within [0, N/2 -1] - keep unchanged and all other values are changed to N/2 which is the padding index and - are mapped to all zeros after embedding. 
In the same way, on device_1, the value V in the - input within [N/2, N-1] will be changed to (V - N/2), and all other values are changed - to N/2 and are mapped to all zeros after embedding. Finally, the results on the two - devices are sum-reduced. - - The Embedding put on single card is as shown below: - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_embedding_single.png - :width: 800 - :height: 350 - :alt: single_embedding - :align: center - - Parallel Embedding is shown as below: - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_embedding_split.png - :width: 800 - :alt: split_embedding - :align: center - - Case 2: Row Parallel Linear - The weight of the linear operation is a NxM matrix with N rows and M columns. - With row parallel linear, the weight is split into num_partitions partitions, each - of which is a matrix with N/num_partitions rows and M column. - - The linear layer put on single card is shown as below, the input variable is represented by X, - the weight matrix is represented by W and the output vaiable is O. The linear layer on single card is - simple matrix multiplication operation, O = X * W. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_single.png - :width: 800 - :alt: single_linear - :align: center - - Row Parallel Linear is shown as below. As the name suggests, Row Parallel Linear splits the weight matrix W into - [[W_row1], [W_row2]] along the row. And accordingly the input is splitted along the column into [X_col1, X_col2] and multiply their - respective weight matrices. Finally apply AllReduce on the output from each card to get the final output. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_row.png - :width: 800 - :alt: split_row - :align: center - - Case 3: Column Parallel Linear - The weight of the linear operation is a NxM matrix with N rows and M columns. - With column parallel linear, the weight is split into num_paratitions partitions, each - of which is a matrix with N rows and M/num_partitions column. - - The linear layer put on single card has been illustrated on case 2 and Column Parallel Linear - is shown as below. The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and - these splitted matrices respectively multiply the input. Finally apply AllGather on the output from each card to get the final output. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col.png - :width: 800 - :alt: split_col - :align: center - - As observed, the column parallel linear and row parallel linear can be combined to skip one ALLGATHER communication - operator. Furthermore the Attention and MLP can be combined to imporve the performance as shown below. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col_row.png - :width: 800 - :alt: split_col_row - :align: center - - Args: - x (Tensor): Input tensor. It's data type should be float16, float32, float64, int32 or int64. - size (list|tuple): A list or tuple with two elements indicating the shape of the weight. - operation (str): The name of the operation. The supported operations are 'linear' and 'embedding'. - axis (int, Optional): Indicate along which axis to split the weight. Default: 0. 
- num_partitions (int, Optional): How many parts the weight is partitioned. Default: 1. - gather_out (bool, Optional): Whether to gather the output after computation. By default, the output - on each partitions will be gathered after computation. Default: True. - weight_attr (ParamAttr, Optional): The parameter attribute for the learnable - weights(Parameter) of the specified operation. Default: None. - bias_attr (ParamAttr, Optional): The parameter attribute for the bias - of the specified operation. Default: None. - name (str, Optional): The default value is None. Normally there is no need for user to set this - property. Default: None. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. - - Examples: - .. code-block:: python - - # required: distributed - import paddle - import paddle.distributed.fleet as fleet - - paddle.enable_static() - paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) - fleet.init(is_collective=True) - data = paddle.randint(0, 8, shape=[10,4]) - emb_out = paddle.distributed.split( - data, - (8, 8), - operation="embedding", - num_partitions=2) - - """ - assert isinstance( - size, - (list, tuple)), ("The type of size for " - "paddle.distributed.split must be list or tuple.") - assert len(size) == 2, ("Number of elements in size of " - "paddle.distributed.split must be two.") - assert isinstance(operation, str), ("The type of operation for " - "paddle.distributed.split must be str.") - supported_operations = [ - 'linear', - 'embedding', - ] - assert operation in supported_operations, ( - "The operation for " - "paddle.distributed.split must be one of {}.".format( - supported_operations)) - if _non_static_mode(): - raise ValueError( - "paddle.distributed.split cannot be used in dynamic " - "graph mode, plese use ParallelEmbedding, ParallelRowLinear, " - "ParallelColumnLinear instead.") - else: - from .fleet import fleet - assert fleet._role_maker, ("To use paddle.distributed.split, " - "you must call fleet.init() firstly.") - rank = fleet.worker_index() - nranks = fleet.worker_num() - - # rank within a model parallel group - inner_rank = rank % num_partitions - - if operation == "embedding": - assert axis == 0, ("We only support to split the weight of embedding " - "along the first axis now.") - assert size[0] % num_partitions == 0, \ - "The length of the vocabulary must be divisible by num_partitions " \ - "but received vocabulary={} num_partitions={}".format(size[0], num_partitions) - - per_part_size = size[0] // num_partitions - emb_out = _parallel_embedding(x, - per_part_size, - size, - weight_attr, - inner_rank, - num_partitions, - name, - group=None) - return emb_out - else: - should_split = False - if axis == 0: - assert size[0] % num_partitions == 0, ( - "Number of rows of the weight for linear ({}) must be" - " divisible by num_partitions ({})".format( - size[0], num_partitions)) - per_part_size = size[0] // num_partitions - linear_size = (per_part_size, size[1]) - if x.shape[-1] == size[0]: should_split = True - - elif axis == 1: - assert size[1] % num_partitions == 0, ( - "Number of column of the weight for linear ({}) must be" - " divisible by num_partitions ({})".format( - size[1], num_partitions)) - per_part_size = size[1] // num_partitions - linear_size = (size[0], per_part_size) - else: - raise ValueError("The value of axis must be 0 or 1, but the value " - "given is {}.".format(axis)) - - linear_out = _parallel_linear(x, - linear_size[0], - linear_size[1], - axis, - weight_attr, - bias_attr, - gather_out, 
- inner_rank, - num_partitions, - should_split, - name=name, - group=None) - return linear_out - - def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): """ Scatter tensors in in_tensor_list to all participators averagely and gather the result tensors in out_tensor_list. diff --git a/python/paddle/distributed/communication/comm_utils.py b/python/paddle/distributed/communication/comm_utils.py new file mode 100644 index 0000000000000..62e1bcb4cca94 --- /dev/null +++ b/python/paddle/distributed/communication/comm_utils.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class ReduceOp: + """ + + Specify the type of operation used for element-wise reductions. + It should be one of the following values: + + ReduceOp.SUM + + ReduceOp.MAX + + ReduceOp.MIN + + ReduceOp.PROD + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + dist.all_reduce(data, op=dist.ReduceOp.SUM) + print(data) + # [[5, 7, 9], [5, 7, 9]] (2 GPUs) + """ + SUM = 0 + MAX = 1 + MIN = 2 + PROD = 3 + AVG = 4 diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index 24194dd9fb1e2..3dd9f60b81295 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -13,5 +13,7 @@ # limitations under the License. from .all_reduce import all_reduce +from .send import send +from .recv import recv -__all__ = ["all_reduce"] +__all__ = ["all_reduce", "send", "recv"] diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 6a0b622cf0dfe..f94422f4bd0a6 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
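The stream APIs that follow (`all_reduce`, `send`, `recv`) share one flag contract: communicating on the calculation stream only makes sense for synchronous ops, so `sync_op=False` combined with `use_calc_stream=True` is rejected. A standalone sketch of that rule; the helper name `_check_stream_flags` is hypothetical, not part of the patch:

    def _check_stream_flags(sync_op, use_calc_stream):
        # use_calc_stream=True relies on the op completing on the calculation
        # stream, which is only guaranteed when the op is synchronous
        if not sync_op and use_calc_stream:
            raise RuntimeError(
                "use_calc_stream can only be True in sync op behavior.")

    _check_stream_flags(sync_op=True, use_calc_stream=True)    # valid
    _check_stream_flags(sync_op=True, use_calc_stream=False)   # valid
    _check_stream_flags(sync_op=False, use_calc_stream=False)  # valid
    # sync_op=False with use_calc_stream=True raises RuntimeError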
+import paddle.distributed.collective as collective import paddle.fluid.framework as framework -from ...collective import _get_default_group, _get_reduce_op, ReduceOp def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): - op_type = _get_reduce_op(op, "all_reduce") - group = _get_default_group() if group is None else group + op_type = collective._get_reduce_op(op, "all_reduce") + group = collective._get_default_group() if group is None else group if use_calc_stream: return group.process_group.allreduce_on_calc_stream(tensor, op_type) @@ -30,7 +30,7 @@ def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): def all_reduce(tensor, - op=ReduceOp.SUM, + op=collective.ReduceOp.SUM, group=None, sync_op=True, use_calc_stream=False): diff --git a/python/paddle/distributed/communication/stream/recv.py new file mode 100644 index 0000000000000..b225f64b8b4d2 --- /dev/null +++ b/python/paddle/distributed/communication/stream/recv.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.collective as collective +import paddle.fluid.framework as framework + + +def _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.recv_on_calc_stream(tensor, src) + + task = group.process_group.recv(tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Receive a tensor from the source device. + + Args: + tensor (Tensor): The tensor to receive. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is synchronous or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on the calculation stream. If none is given, use false as default. This + option is designed for high performance demands; be careful to turn it on only if you clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + ..
code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + task = dist.stream.send(data, dst=1, sync_op=False) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.recv(data, src=0, sync_op=False) + task.wait() + out = data.numpy() + # [[4, 5, 6], [4, 5, 6]] + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be members of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.recv is only supported in dygraph mode now.") diff --git a/python/paddle/distributed/communication/stream/send.py new file mode 100644 index 0000000000000..fa052734c7ee7 --- /dev/null +++ b/python/paddle/distributed/communication/stream/send.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.distributed.collective as collective +import paddle.fluid.framework as framework + + +def _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.send_on_calc_stream(tensor, dst) + + task = group.process_group.send(tensor, dst, sync_op) + if sync_op: + task.wait() + + return task + + +def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Send a tensor to the destination device. + + Args: + tensor (Tensor): The tensor to send. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + dst (int, optional): Rank of the destination device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is synchronous or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on the calculation stream. If none is given, use false as default. This + option is designed for high performance demands; be careful to turn it on only if you clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + ..
code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + task = dist.stream.send(data, dst=1, sync_op=False) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.recv(data, src=0, sync_op=False) + task.wait() + out = data.numpy() + # [[4, 5, 6], [4, 5, 6]] + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be members of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.send is only supported in dygraph mode now.") diff --git a/python/paddle/distributed/fleet/layers/mpu/__init__.py new file mode 100644 index 0000000000000..11b6970265003 --- /dev/null +++ b/python/paddle/distributed/fleet/layers/mpu/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .mp_layers import VocabParallelEmbedding +from .mp_layers import ColumnParallelLinear +from .mp_layers import RowParallelLinear +from .mp_layers import ParallelCrossEntropy + +from .random import RNGStatesTracker +from .random import get_rng_state_tracker +from .random import model_parallel_random_seed +from .random import determinate_seed +from .random import dropout diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py new file mode 100644 index 0000000000000..2ba9ce9ed76a9 --- /dev/null +++ b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py @@ -0,0 +1,466 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from . import mp_ops +from paddle.fluid import core +from paddle.fluid.dygraph.layers import Layer +from .random import get_rng_state_tracker +from paddle.nn import functional as F +from paddle import framework +from paddle.autograd import PyLayer +from ...base import topology as tp + +__all__ = [] + +# This file follows the approach of: +# Shoeybi M, Patwary M, Puri R, et al.
Megatron-lm: Training multi-billion parameter +# language models using model parallelism[J]. arXiv preprint arXiv:1909.08053, 2019. (https://arxiv.org/abs/1909.08053) + + +def is_fused_matmul_bias_supported(): + if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): + return hasattr(core.ops, 'fused_gemm_epilogue') + else: + return False + + +class VocabParallelEmbedding(Layer): + """Embedding parallelized in the vocabulary dimension (model parallel). + This class is used for splitting an embedding within the mp group along the vocabulary dimension. + + Args: + num_embeddings(int): The size of the embedding dictionary. + embedding_dim(int): The dimension of each embedding vector. + weight_attr(ParamAttr|None): To specify the weight parameter property. Default: None, which means the + default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition, + user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. + The local word vector needs to be transformed into numpy format, and the shape of local word + vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer` + is used to load custom or pre-trained word vectors. See code example for details. + mp_group(Group): The tensor parallel group. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually there is no need to set it, and + it is None by default. + + Examples: + .. code-block:: python + import paddle + from paddle.distributed import fleet + + class SimpleMPNet(paddle.nn.Layer): + def __init__(self, vocab_size, hidden_size, inner_size, output_size): + super(SimpleMPNet, self).__init__() + self.linear1 = fleet.meta_parallel.ColumnParallelLinear( + hidden_size, + inner_size, + gather_output=False, + has_bias=True) + + self.linear2 = fleet.meta_parallel.RowParallelLinear( + inner_size, + hidden_size, + input_is_parallel=True, + has_bias=True) + + self.linear3 = paddle.nn.Linear(hidden_size, output_size) + + self.embedding = fleet.meta_parallel.VocabParallelEmbedding( + vocab_size, + hidden_size) + + def forward(self, x): + x = self.embedding(x) + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + return x + """ + + def __init__(self, + num_embeddings, + embedding_dim, + weight_attr=None, + mp_group=None, + name=None): + super(VocabParallelEmbedding, self).__init__() + + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) if mp_group is None else mp_group + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) if mp_group is None else mp_group.nranks + self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( + ) if mp_group is None else mp_group.rank + + self.origin_num_embeddings = num_embeddings + self.is_mp = (self.world_size > 1) + + assert num_embeddings % self.world_size == 0, ( + "The length of the vocabulary must be divisible by the parallelism degree of MP" + ) + + per_part_size = num_embeddings // self.world_size + + self.vocab_start_index = self.rank * per_part_size + self._dtype = self._helper.get_default_dtype() + self._size = [per_part_size, embedding_dim] + self._weight_attr = weight_attr + self._name = name + + if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter(attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + else: + self.weight =
self.create_parameter(attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + + self.weight.is_distributed = True if self.is_mp else False + + def forward(self, x): + if self.is_mp: + output_parallel = mp_ops._c_lookup_table( + self.weight, + x, + start_index=self.vocab_start_index, + name=self._name) + output = mp_ops._mp_allreduce(output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) + else: + output = F.embedding(x, + weight=self.weight, + padding_idx=None, + sparse=False, + name=self._name) + return output + + +class ColumnParallelLinear(Layer): + """Linear layer with column-parallelized weight (model parallel). + This class is used for splitting a Linear layer within the mp group by splitting its weight along the column dimension. + + Args: + in_features(int): The number of input units. + out_features(int): The number of output units. + weight_attr(ParamAttr|None): The attribute for the learnable weight of this layer. The default value is None + and the weight will be initialized to zero. For detailed information, please refer to paddle.ParamAttr. + has_bias(bool): whether to add bias. + gather_output(bool): whether to do allgather on the output of each rank. + fuse_matmul_bias(bool): whether to fuse matmul and bias. + mp_group(Group): The tensor parallel group. + name(str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . + + Examples: + .. code-block:: python + import paddle + from paddle.distributed import fleet + + class SimpleMPNet(paddle.nn.Layer): + def __init__(self, vocab_size, hidden_size, inner_size, output_size): + super(SimpleMPNet, self).__init__() + self.linear1 = fleet.meta_parallel.ColumnParallelLinear( + hidden_size, + inner_size, + gather_output=False, + has_bias=True) + + self.linear2 = fleet.meta_parallel.RowParallelLinear( + inner_size, + hidden_size, + input_is_parallel=True, + has_bias=True) + + self.linear3 = paddle.nn.Linear(hidden_size, output_size) + + self.embedding = fleet.meta_parallel.VocabParallelEmbedding( + vocab_size, + hidden_size) + + def forward(self, x): + x = self.embedding(x) + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + return x + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + has_bias=None, + gather_output=True, + fuse_matmul_bias=False, + mp_group=None, + name=None): + super(ColumnParallelLinear, self).__init__() + + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) if mp_group is None else mp_group + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) if mp_group is None else mp_group.nranks + self._name = name + self.is_mp = (self.world_size > 1) + + self.gather_output = gather_output + assert out_features % self.world_size == 0, ( + "Number of columns of the weight for linear ({}) must be" + " divisible by model parallel size ({})".format( + out_features, self.world_size)) + self.output_size_per_partition = out_features // self.world_size + + self._weight_attr = weight_attr + self._dtype = self._helper.get_default_dtype() + + if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[in_features, self.output_size_per_partition], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter( + shape=[in_features, self.output_size_per_partition], + attr=self._weight_attr, +
dtype=self._dtype, + is_bias=False) + + self.weight.is_distributed = True if self.is_mp else False + + if has_bias: + # initialize bias to zero like Megatron + self.bias = self.create_parameter( + shape=[self.output_size_per_partition], + attr=paddle.nn.initializer.Constant(value=0.0), + dtype=self._dtype, + is_bias=True) + self.bias.is_distributed = True if self.is_mp else False + else: + self.bias = None + + self.linear = F.linear + + if fuse_matmul_bias: + if not is_fused_matmul_bias_supported(): + raise NotImplementedError( + "You set fuse_matmul_bias=True in ColumnParallelLinear, " + "however, the paddle you are using does not support this operation. " + "Please set fuse_matmul_bias=False or use paddle compiled " + "with cuda 11.6 or higher.") + from paddle.incubate.nn.functional import fused_linear + self.linear = fused_linear + + def forward(self, x): + # use inner api to process identity + if self.is_mp: + input_parallel = mp_ops._c_identity(x, + group=self.model_parallel_group) + else: + input_parallel = x + + output_parallel = self.linear(input_parallel, + self.weight, + self.bias, + name=self._name) + + if self.gather_output and self.is_mp: + output = mp_ops._c_concat(output_parallel, + group=self.model_parallel_group) + else: + output = output_parallel + return output + + +class RowParallelLinear(Layer): + """Linear layer with row-parallelized weight (model parallel). + This class is used for splitting a Linear layer within the mp group by splitting its weight along the row dimension. + + Args: + in_features(int): The number of input units. + out_features(int): The number of output units. + weight_attr(ParamAttr|None): The attribute for the learnable weight of this layer. The default value is None + and the weight will be initialized to zero. For detailed information, please refer to paddle.ParamAttr. + has_bias(bool): whether to add bias. + input_is_parallel(bool): whether the input has already been split across the mp group. + fuse_matmul_bias(bool): whether to fuse matmul and bias. + mp_group(Group): The tensor parallel group. + name(str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . + + Examples: + ..
code-block:: python + import paddle + from paddle.distributed import fleet + + class SimpleMPNet(paddle.nn.Layer): + def __init__(self, vocab_size, hidden_size, inner_size, output_size): + super(SimpleMPNet, self).__init__() + self.linear1 = fleet.meta_parallel.ColumnParallelLinear( + hidden_size, + inner_size, + gather_output=False, + has_bias=True) + + self.linear2 = fleet.meta_parallel.RowParallelLinear( + inner_size, + hidden_size, + input_is_parallel=True, + has_bias=True) + + self.linear3 = paddle.nn.Linear(hidden_size, output_size) + + self.embedding = fleet.meta_parallel.VocabParallelEmbedding( + vocab_size, + hidden_size) + + def forward(self, x): + x = self.embedding(x) + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + return x + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + has_bias=True, + input_is_parallel=False, + fuse_matmul_bias=False, + mp_group=None, + name=None): + super(RowParallelLinear, self).__init__() + + self.in_features = in_features + self.out_features = out_features + self.input_is_parallel = input_is_parallel + self._weight_attr = weight_attr + self._dtype = self._helper.get_default_dtype() + self._name = name + + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) if mp_group is None else mp_group + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) if mp_group is None else mp_group.nranks + self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( + ) if mp_group is None else mp_group.rank + + self.is_mp = (self.world_size > 1) + assert in_features % self.world_size == 0, ( + "Number of rows of the weight for linear ({}) must be" + " divisible by model parallel size ({})".format( + in_features, self.world_size)) + + self.input_size_per_partition = in_features // self.world_size + + if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + + self.weight.is_distributed = True if self.is_mp else False + + if has_bias: + self.bias = self.create_parameter( + shape=[self.out_features], + attr=paddle.nn.initializer.Constant(value=0.0), + dtype=self._dtype, + is_bias=True) + else: + self.bias = None + + self.linear = F.linear + + if fuse_matmul_bias: + if not is_fused_matmul_bias_supported(): + raise NotImplementedError( + "You set fuse_matmul_bias=True in RowParallelLinear, " + "however, the paddle you are using does not support this operation.
" + "Please set fuse_matmul_bias=False or use paddle compiled " + "with cuda 11.6 or higher.") + from paddle.incubate.nn.functional import fused_linear + self.linear = fused_linear + + def forward(self, x): + if self.input_is_parallel or (not self.is_mp): + input_parallel = x + else: + # split last dim + input_parallel = mp_ops._c_split(x, group=self.model_parallel_group) + + if self.is_mp: + output_parallel = self.linear(input_parallel, + self.weight, + name=self._name) + output_ = mp_ops._mp_allreduce(output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) + output = output_ + self.bias if self.bias is not None else output_ + else: + output = self.linear(input_parallel, + self.weight, + self.bias, + name=self._name) + + return output + + +class ParallelCrossEntropy(Layer): + """CrossEntropy with mp parallelized. + this class is used for splitting softmax cross entropy in mp group. + + Args: + mp_group(Group): The tensor parallel group. + name(str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . + + Examples: + .. code-block:: python + loss_func = ParallelCrossEntropy() + loss = loss_func(img, lable) + """ + + def __init__(self, mp_group=None, name=None): + super(ParallelCrossEntropy, self).__init__() + self.name = name + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) if mp_group is None else mp_group + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) if mp_group is None else mp_group.nranks + self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( + ) if mp_group is None else mp_group.rank + + def forward(self, input, label): + loss = mp_ops._c_softmax_with_cross_entropy( + input, label, group=self.model_parallel_group) + return loss diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py new file mode 100644 index 0000000000000..dc4dc05c7ba41 --- /dev/null +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -0,0 +1,772 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import _C_ops, _legacy_C_ops +from paddle.fluid import core +from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import _in_legacy_dygraph +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.fluid.dygraph import layers +from paddle.distributed import collective +from ....communication.comm_utils import ReduceOp +from paddle.fluid.data_feeder import check_dtype +import paddle.fluid.dygraph_utils as dygraph_utils + + +def _c_identity(tensor, group=None): + """ + Return a copy of the tensor, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. 
Its data type + should be float16, float32, float64, int32 or int64. + group (int): The id of the process group to work on. + + Returns: + Tensor. + """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + if _non_static_mode(): + return _legacy_C_ops.c_identity(tensor, 'use_calc_stream', True, + 'ring_id', ring_id, + 'use_model_parallel', True) + op_type = 'c_identity' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_identity') + + helper.append_op(type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + return out + + +def _c_concat(tensor, group=None): + """ + Return allgather of the tensor, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. Its data type + should be float16, float32, float64, int32 or int64. + group (int): The id of the process group to work on. + + Returns: + Tensor. + """ + if group is not None and not group.is_member(): + return + group = collective._get_default_group() if group is None else group + ring_id = group.id + + global_rank = collective._get_global_env().rank + rank = group.rank + nranks = group.nranks + + if _non_static_mode(): + return _legacy_C_ops.c_concat(tensor, 'ring_id', ring_id, + 'use_calc_stream', True, 'rank', rank, + 'nranks', nranks, 'use_model_parallel', + True) + + op_type = 'c_concat' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_concat') + + helper.append_op(type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True, + 'nranks': nranks, + 'rank': rank + }) + return out + + +def _c_split(tensor, group=None): + """ + Split tensor evenly among all members, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. Its data type + should be float16, float32, float64, int32 or int64. + rank (int): The rank of the current process. + group (int): The id of the process group to work on. + + Returns: + Tensor. 
+ """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + global_rank = collective._get_global_env().rank + rank = global_rank if group is None else group.get_group_rank(global_rank) + nranks = collective._get_global_env( + ).world_size if group is None else group.nranks + + if _non_static_mode(): + return _legacy_C_ops.c_split(tensor, 'use_calc_stream', True, 'ring_id', + ring_id, 'rank', rank, 'nranks', nranks, + 'use_model_parallel', True) + + op_type = 'c_split' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_split') + + helper.append_op(type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'rank': rank, + 'nranks': nranks, + 'use_model_parallel': True, + }) + return out + + +def _mp_allreduce(tensor, + op=ReduceOp.SUM, + group=None, + use_calc_stream=True, + use_model_parallel=True): + """[it is same as allreduce above, but it supports model parallel. And it support inplace startegy] + """ + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + group = collective._get_default_group() if group is None else group + assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op) + + from paddle.autograd import PyLayer + + class mp_allreduce_eager(PyLayer): + + @staticmethod + def forward(ctx, tensor, group, use_calc_stream, + use_model_parallel): + ctx.ring_id = group.id + + if use_calc_stream: + op_type = collective._get_reduce_op(op, "_mp_allreduce") + group.process_group.allreduce_on_calc_stream( + tensor, op_type) + return tensor + else: + return _legacy_C_ops.c_allreduce_sum_( + tensor, 'use_calc_stream', use_calc_stream, 'ring_id', + ring_id, "use_model_parallel", use_model_parallel) + + @staticmethod + def backward(ctx, dy): + return _legacy_C_ops.c_identity(dy, 'use_calc_stream', True, + 'ring_id', ctx.ring_id, + 'use_model_parallel', True) + + return mp_allreduce_eager.apply(tensor, group, use_calc_stream, + use_model_parallel) + + ring_id = 0 if group is None else group.id + if _in_legacy_dygraph(): + if op == ReduceOp.SUM: + return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, "use_model_parallel", + use_model_parallel) + else: + raise ValueError("Unknown parameter: {}.".format(op)) + + op_type = 'c_allreduce_sum' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + op_type) + + helper.append_op(type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + 'use_model_parallel': use_model_parallel, + }) + return out + + +def _c_lookup_table(table, index, start_index=0, name=None): + """ + Lookup table according to index. + + Args: + table (Tensor): The input Tensor. Its data type + should be float16, float32, float64. + index (Tensor): The index to lookup table. + start_index (int): The initial index for table range. + name (string): The name of the api + + Returns: + Tensor. 
+ """ + if _non_static_mode(): + return _legacy_C_ops.c_embedding(table, index, "start_index", + start_index) + + op_type = 'c_embedding' + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='table') + check_variable_and_dtype(index, 'input', ['int32', 'int64'], op_type) + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op(type='c_embedding', + inputs={ + 'Ids': index, + 'W': table + }, + outputs={'Out': tmp}, + attrs={"start_index": start_index}) + return tmp + + +class _Linear(layers.Layer): + """ + Linear + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + super(_Linear, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self.weight = self.create_parameter(shape=[in_features, out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.bias = self.create_parameter(shape=[out_features], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + self.name = name + + def forward(self, input): + out = _linear(x=input, + weight=self.weight, + bias=self.bias, + name=self.name) + return out + + def extra_repr(self): + name_str = ', name={}'.format(self.name) if self.name else '' + return 'in_features={}, out_features={}, dtype={}{}'.format( + self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) + + +def _c_softmax_with_cross_entropy(logits, + label, + group=None, + return_softmax=False): + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + global_rank = collective._get_global_env().rank + rank = global_rank if group is None else group.get_group_rank(global_rank) + nranks = collective._get_global_env( + ).world_size if group is None else group.nranks + + input_dims = len(list(logits.shape)) + label_dims = len(list(label.shape)) + if input_dims - 1 != label_dims and input_dims != label_dims: + raise ValueError( + 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ + (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) + if input_dims - 1 == label_dims: + label = paddle.unsqueeze(label, axis=-1) + + if _non_static_mode(): + softmax, loss = _legacy_C_ops.c_softmax_with_cross_entropy( + logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks) + if not return_softmax: + return loss + else: + return loss, softmax + + attrs = { + 'ring_id': ring_id, + 'rank': rank, + 'nranks': nranks, + } + helper = LayerHelper('c_softmax_with_cross_entropy', **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + helper.append_op(type='c_softmax_with_cross_entropy', + inputs={ + 'Logits': logits, + 'Label': label + }, + outputs={ + 'Softmax': softmax, + 'Loss': loss + }, + attrs=attrs) + + if return_softmax: + return loss, softmax + + return loss + + +def _linear(x, weight, bias=None, name=None): + """ + Fuction Linear + """ + if _non_static_mode(): + pre_bias = _varbase_creator(dtype=x.dtype) + _legacy_C_ops.matmul(x, weight, pre_bias, 'transpose_X', False, + 'transpose_Y', False, "alpha", 1) + return dygraph_utils._append_bias_in_dygraph(pre_bias, + bias, + axis=len(x.shape) - 1) + else: + helper = LayerHelper('linear', **locals()) + dtype = x.dtype + assert len( + x.shape) < 4, "X latitude is not supported greater than 3 now." 
+
+        check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                                 'linear')
+        check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear')
+
+        inputs = {'X': [x], 'Y': [weight]}
+        attrs = {
+            'transpose_X': False,
+            'transpose_Y': False,
+            'alpha': 1,
+        }
+        tmp = helper.create_variable_for_type_inference(dtype)
+        helper.append_op(type='matmul_v2',
+                         inputs=inputs,
+                         outputs={'Out': tmp},
+                         attrs=attrs)
+        if bias is not None:
+            res = helper.create_variable_for_type_inference(dtype)
+            helper.append_op(type='elementwise_add',
+                             inputs={
+                                 'X': [tmp],
+                                 'Y': [bias]
+                             },
+                             outputs={'Out': [res]},
+                             attrs={'axis': len(x.shape) - 1})
+        else:
+            res = tmp
+        return res
+
+
+def _set_var_distributed(var):
+    if var is None:
+        return
+
+    var.is_distributed = True
+
+    # NOTE: use current_block and find_var_recursive to support while_loop
+    startup_block = paddle.static.default_startup_program().current_block()
+    main_block = paddle.static.default_main_program().current_block()
+    startup_block._find_var_recursive(var.name).is_distributed = True
+    main_block._find_var_recursive(var.name).is_distributed = True
+
+
+def _parallel_linear(x,
+                     num_rows,
+                     num_cols,
+                     axis,
+                     param_attr,
+                     bias_attr,
+                     gather_out,
+                     inner_rank,
+                     nranks,
+                     split_tensor,
+                     name,
+                     group=None):
+    """
+    Parallel Linear
+
+    axis is the dimension of the linear layer's parameter along which it is split.
+    axis = 0: the row dimension
+    axis = 1: the col dimension
+
+    """
+    if group is not None and not group.is_member():
+        return
+    ring_id = 0 if group is None else group.id
+
+    if axis == 0:
+        if split_tensor:
+            x = _c_split(x, group=group)
+        else:
+            x = _c_identity(x, group=group)
+
+    linear = paddle.nn.Linear(num_rows,
+                              num_cols,
+                              weight_attr=param_attr,
+                              bias_attr=bias_attr,
+                              name=name)
+
+    # NOTE: the npu linear function uses matmul_v2 while linear uses matmul
+    linear_function = _linear if core.is_compiled_with_npu()\
+        else paddle.nn.functional.linear
+    linear_out = linear_function(
+        x,
+        linear.weight,
+        # NOTE(wangxi): for a row split, the bias needs to be added after the allreduce
+        None if axis == 0 else linear.bias,
+        linear.name)
+
+    _set_var_distributed(linear.weight)
+    # set is_distributed for the split bias
+    # if a linear layer is split by row, each rank holds a complete bias and it should be the same on each rank.
+    # if a linear layer is split by column, the bias is also split across ranks like its weight
+    if axis == 1 and linear._bias_attr != False:
+        _set_var_distributed(linear.bias)
+
+    if not gather_out: return linear_out
+
+    out_shape = list(linear_out.shape)
+    out_shape[0] *= 1 if axis == 0 else nranks
+    main_block = paddle.static.default_main_program().current_block()
+    out = main_block.create_var(
+        shape=out_shape,
+        dtype=linear_out.dtype,
+        type=linear_out.type,
+        lod_level=linear_out.lod_level,
+        persistable=False,
+        is_data=False,
+        need_check_feed=linear_out.desc.need_check_feed())
+    if axis == 0:
+        main_block.append_op(type='c_allreduce_sum',
+                             inputs={'X': linear_out},
+                             outputs={'Out': out},
+                             attrs={
+                                 'ring_id': ring_id,
+                                 'use_calc_stream': True,
+                                 'use_model_parallel': True
+                             })
+        if linear.bias is not None:
+            out = out + linear.bias
+    else:
+        main_block.append_op(type='c_concat',
+                             inputs={'X': linear_out},
+                             outputs={'Out': out},
+                             attrs={
+                                 'rank': inner_rank,
+                                 'ring_id': ring_id,
+                                 'nranks': nranks,
+                                 'use_calc_stream': True,
+                                 'use_model_parallel': True
+                             })
+    return out
+
+
+def _parallel_embedding(x,
+                        per_part_embeddings,
+                        origin_size,
+                        param_attr,
+                        inner_rank,
+                        num_partitions,
+                        name,
+                        group=None):
+    """
+    Parallel Embedding
+    """
+    if group is not None and not group.is_member():
+        return
+    ring_id = 0 if group is None else group.id
+
+    helper = LayerHelper("_parallel_embedding", **locals())
+
+    per_part_size = per_part_embeddings
+    rank = inner_rank
+
+    vocab_start_index = rank * per_part_size
+    dtype = helper.get_default_dtype()
+    size = [per_part_size, origin_size[1]]
+
+    weight = helper.create_parameter(attr=param_attr,
+                                     shape=size,
+                                     dtype=dtype,
+                                     is_bias=False)
+
+    if num_partitions == 1:
+        return paddle.nn.functional.embedding(x,
+                                              weight=weight,
+                                              padding_idx=None,
+                                              sparse=False,
+                                              name=name)
+
+    startup_block = paddle.static.default_startup_program().global_block()
+    main_block = paddle.static.default_main_program().global_block()
+    startup_block.vars[weight.name].is_distributed = True
+    main_block.vars[weight.name].is_distributed = True
+
+    output_parallel = _c_lookup_table(weight,
+                                      x,
+                                      start_index=vocab_start_index,
+                                      name=name)
+    out = _mp_allreduce(output_parallel,
+                        group=group,
+                        use_calc_stream=True,
+                        use_model_parallel=True)
+    return out
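The start_index masking that _c_lookup_table and _parallel_embedding rely on can be written out directly: ids outside a rank's vocabulary range contribute zero rows, so the subsequent _mp_allreduce reconstructs the full lookup. A NumPy sketch for one of two hypothetical ranks (all sizes illustrative):

.. code-block:: python

    import numpy as np

    per_part, dim, rank = 4, 3, 1
    table_part = np.random.rand(per_part, dim)   # this rank's slice of the table
    ids = np.array([0, 5, 7])
    start = rank * per_part                      # vocab_start_index in the code

    local = ids - start
    in_range = (local >= 0) & (local < per_part)
    rows = table_part[np.clip(local, 0, per_part - 1)]
    out = np.where(in_range[:, None], rows, 0.0)  # zero rows for out-of-range ids
    # summing `out` over both ranks equals indexing the unsplit table

+
+
+def split(x,
+          size,
+          operation,
+          axis=0,
+          num_partitions=1,
+          gather_out=True,
+          weight_attr=None,
+          bias_attr=None,
+          name=None):
+    """
+
+    Split the weight of the specified operation into multiple devices
+    and do the computation in parallel.
+
+    Now the following three cases are supported.
+
+    Case 1: Parallel Embedding
+        The weight of the embedding operation is a NxM matrix with N rows and M columns.
+        With parallel embedding, the weight is split into num_partitions partitions, each
+        of which is a matrix with (N/num_partitions + 1) rows and M columns where the last
+        row serves as the padding idx.
+
+        Suppose we split the NxM weight into two partitions on device_0 and device_1
+        respectively. Then, on each device, the final weight has (N/2 + 1) rows with the
+        index range from 0 to N/2. On device_0, all values in the input within [0, N/2 -1]
+        remain unchanged and all other values are changed to N/2 which is the padding index and
+        are mapped to all zeros after embedding. In the same way, on device_1, the value V in the
+        input within [N/2, N-1] will be changed to (V - N/2), and all other values are changed
+        to N/2 and are mapped to all zeros after embedding.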
+        Finally, the results on the two devices are sum-reduced.
+
+        The Embedding put on single card is as shown below:
+
+        .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_embedding_single.png
+            :width: 800
+            :height: 350
+            :alt: single_embedding
+            :align: center
+
+        Parallel Embedding is shown as below:
+
+        .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_embedding_split.png
+            :width: 800
+            :alt: split_embedding
+            :align: center
+
+    Case 2: Row Parallel Linear
+        The weight of the linear operation is a NxM matrix with N rows and M columns.
+        With row parallel linear, the weight is split into num_partitions partitions, each
+        of which is a matrix with N/num_partitions rows and M columns.
+
+        The linear layer put on single card is shown as below, the input variable is represented by X,
+        the weight matrix is represented by W and the output variable is O. The linear layer on a single
+        card is a simple matrix multiplication operation, O = X * W.
+
+        .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_single.png
+            :width: 800
+            :alt: single_linear
+            :align: center
+
+        Row Parallel Linear is shown as below. As the name suggests, Row Parallel Linear splits the weight matrix W into
+        [[W_row1], [W_row2]] along the row. Accordingly, the input is split along the column into [X_col1, X_col2], and the
+        splits are multiplied by their respective weight matrices. Finally, an AllReduce is applied on the output from each
+        card to get the final output.
+
+        .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_row.png
+            :width: 800
+            :alt: split_row
+            :align: center
+
+    Case 3: Column Parallel Linear
+        The weight of the linear operation is a NxM matrix with N rows and M columns.
+        With column parallel linear, the weight is split into num_partitions partitions, each
+        of which is a matrix with N rows and M/num_partitions columns.
+
+        The linear layer put on single card has been illustrated in case 2 and Column Parallel Linear
+        is shown as below. Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and
+        these split matrices respectively multiply the input. Finally, an AllGather is applied on the output from each
+        card to get the final output.
+
+        .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col.png
+            :width: 800
+            :alt: split_col
+            :align: center
+
+    As observed, the column parallel linear and row parallel linear can be combined to skip one ALLGATHER communication
+    operator. Furthermore, the Attention and MLP can be combined to improve the performance as shown below.
+
+    .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col_row.png
+        :width: 800
+        :alt: split_col_row
+        :align: center
+
+    Args:
+        x (Tensor): Input tensor. Its data type should be float16, float32, float64, int32 or int64.
+        size (list|tuple): A list or tuple with two elements indicating the shape of the weight.
+        operation (str): The name of the operation. The supported operations are 'linear' and 'embedding'.
+        axis (int, Optional): Indicate along which axis to split the weight. Default: 0.
+        num_partitions (int, Optional): How many parts the weight is partitioned. Default: 1.
+        gather_out (bool, Optional): Whether to gather the output after computation.
+            By default, the outputs on each partition will be gathered after computation.
+            Default: True.
+        weight_attr (ParamAttr, Optional): The parameter attribute for the learnable
+            weights(Parameter) of the specified operation. Default: None.
+        bias_attr (ParamAttr, Optional): The parameter attribute for the bias
+            of the specified operation. Default: None.
+        name (str, Optional): The default value is None. Normally there is no need for user to set this
+            property. Default: None. For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            # required: distributed
+            import paddle
+            import paddle.distributed.fleet as fleet
+
+            paddle.enable_static()
+            paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id)
+            fleet.init(is_collective=True)
+            data = paddle.randint(0, 8, shape=[10,4])
+            emb_out = paddle.distributed.split(
+                data,
+                (8, 8),
+                operation="embedding",
+                num_partitions=2)
+
+    """
+    assert isinstance(
+        size,
+        (list, tuple)), ("The type of size for "
+                         "paddle.distributed.split must be list or tuple.")
+    assert len(size) == 2, ("Number of elements in size of "
+                            "paddle.distributed.split must be two.")
+    assert isinstance(operation, str), ("The type of operation for "
+                                        "paddle.distributed.split must be str.")
+    supported_operations = [
+        'linear',
+        'embedding',
+    ]
+    assert operation in supported_operations, (
+        "The operation for "
+        "paddle.distributed.split must be one of {}.".format(
+            supported_operations))
+    if _non_static_mode():
+        raise ValueError(
+            "paddle.distributed.split cannot be used in dynamic "
+            "graph mode, please use VocabParallelEmbedding, RowParallelLinear, "
+            "ColumnParallelLinear instead.")
+    else:
+        from paddle.distributed.fleet import fleet
+        assert fleet._role_maker, ("To use paddle.distributed.split, "
+                                   "you must call fleet.init() first.")
+        rank = fleet.worker_index()
+        nranks = fleet.worker_num()
+
+    # rank within a model parallel group
+    inner_rank = rank % num_partitions
+
+    if operation == "embedding":
+        assert axis == 0, ("We only support to split the weight of embedding "
+                           "along the first axis now.")
+        assert size[0] % num_partitions == 0, \
+            "The length of the vocabulary must be divisible by num_partitions " \
+            "but received vocabulary={} num_partitions={}".format(size[0], num_partitions)
+
+        per_part_size = size[0] // num_partitions
+        emb_out = _parallel_embedding(x,
+                                      per_part_size,
+                                      size,
+                                      weight_attr,
+                                      inner_rank,
+                                      num_partitions,
+                                      name,
+                                      group=None)
+        return emb_out
+    else:
+        should_split = False
+        if axis == 0:
+            assert size[0] % num_partitions == 0, (
+                "Number of rows of the weight for linear ({}) must be"
+                " divisible by num_partitions ({})".format(
+                    size[0], num_partitions))
+            per_part_size = size[0] // num_partitions
+            linear_size = (per_part_size, size[1])
+            if x.shape[-1] == size[0]: should_split = True
+
+        elif axis == 1:
+            assert size[1] % num_partitions == 0, (
+                "Number of columns of the weight for linear ({}) must be"
+                " divisible by num_partitions ({})".format(
+                    size[1], num_partitions))
+            per_part_size = size[1] // num_partitions
+            linear_size = (size[0], per_part_size)
+        else:
+            raise ValueError("The value of axis must be 0 or 1, but the value "
+                             "given is {}.".format(axis))
+
+        linear_out = _parallel_linear(x,
+                                      linear_size[0],
+                                      linear_size[1],
+                                      axis,
+                                      weight_attr,
+                                      bias_attr,
+                                      gather_out,
+                                      inner_rank,
+                                      num_partitions,
+                                      should_split,
+                                      name=name,
+                                      group=None)
+        return linear_out
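For symmetry with the embedding example in the docstring above, the linear case of paddle.distributed.split might be invoked as follows; a hedged sketch mirroring the documented usage (static graph, two partitions, illustrative sizes):

.. code-block:: python

    # required: distributed
    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)
    fleet.init(is_collective=True)
    data = paddle.ones(shape=[8, 16], dtype='float32')
    # axis=1 splits the 16x32 weight into two 16x16 column shards
    fc_out = paddle.distributed.split(
        data, (16, 32), operation="linear", axis=1, num_partitions=2)

diff --git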
a/python/paddle/distributed/fleet/layers/mpu/random.py b/python/paddle/distributed/fleet/layers/mpu/random.py
new file mode 100644
index 0000000000000..7577be6253cbf
--- /dev/null
+++ b/python/paddle/distributed/fleet/layers/mpu/random.py
@@ -0,0 +1,243 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import numpy as np
+import contextlib
+from paddle import _C_ops, _legacy_C_ops
+from paddle.fluid import core
+from paddle.fluid.data_feeder import check_variable_and_dtype
+from paddle.fluid.framework import _non_static_mode, default_main_program, Variable
+from paddle.fluid.layer_helper import LayerHelper
+
+__all__ = []
+
+MODEL_PARALLEL_RNG = 'model_parallel_rng'
+
+# This file is inspired by Megatron to control random states for MP:
+# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/random.py
+
+
+class RNGStatesTracker:
+    """
+    Track the RNG states.
+    """
+
+    def __init__(self):
+        # Map from name to the rng state.
+        self.states_ = {}
+        self.seeds_ = set()
+
+    def reset(self):
+        self.states_ = {}
+        self.seeds_ = set()
+
+    def add(self, name, seed):
+        if seed in self.seeds_:
+            raise ValueError('seed {} already exists'.format(seed))
+        self.seeds_.add(seed)
+        if name in self.states_:
+            raise ValueError('state {} already exists'.format(name))
+        orig_rng_state = paddle.get_cuda_rng_state()
+        paddle.seed(seed)
+        self.states_[name] = paddle.get_cuda_rng_state()
+        paddle.set_cuda_rng_state(orig_rng_state)
+
+    def get_states_tracker(self):
+        states = {}
+        for name in self.states_:
+            states[name] = self.states_[name]
+        return states
+
+    def set_states_tracker(self, states):
+        self.states_ = states
+
+    @contextlib.contextmanager
+    def rng_state(self, name=MODEL_PARALLEL_RNG):
+        if name not in self.states_:
+            raise ValueError('state {} does not exist'.format(name))
+        orig_cuda_rng_state = paddle.get_cuda_rng_state()
+        paddle.set_cuda_rng_state(self.states_[name])
+        try:
+            yield
+        finally:
+            self.states_[name] = paddle.get_cuda_rng_state()
+            paddle.set_cuda_rng_state(orig_cuda_rng_state)
+
+
+RNG_STATE_TRACKER = RNGStatesTracker()
+
+
+def get_rng_state_tracker():
+    return RNG_STATE_TRACKER
+
+
+def model_parallel_random_seed(seed=None):
+    import paddle.distributed.fleet as fleet
+    hcg = fleet.get_hybrid_communicate_group()
+    rank = hcg.get_model_parallel_rank()
+
+    if seed:
+        global_seed = seed
+        local_seed = seed * 1024 + rank * 100
+    else:
+        global_seed = np.random.randint(0, 655350)
+        local_seed = np.random.randint(rank * 10000, (rank + 1) * 10000 - 1)
+
+    RNG_STATE_TRACKER.reset()
+    RNG_STATE_TRACKER.add(MODEL_PARALLEL_RNG, local_seed)
+    paddle.seed(global_seed)
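A minimal usage sketch of the tracker defined above (single process, CUDA device required; the module path is the new file introduced by this patch, and the name and seed are illustrative). Draws inside rng_state() come from the registered stream, and the global CUDA RNG state is restored on exit:

.. code-block:: python

    import paddle
    from paddle.distributed.fleet.layers.mpu.random import get_rng_state_tracker

    tracker = get_rng_state_tracker()
    tracker.add('model_parallel_rng', 1234)       # register a named stream once
    with tracker.rng_state('model_parallel_rng'):
        w = paddle.randn([2, 2])                  # drawn from the tracked stream
    # outside the context, the global CUDA RNG state is unchanged

+
+
+def determinate_seed(rng_name):
+    assert rng_name is not None and rng_name != ""
+    helper = LayerHelper('seed', **locals())
+    out = helper.create_variable_for_type_inference(dtype=paddle.int32)
+    # set force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang
+    helper.append_op(type='seed',
+                     outputs={'Out': out},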
+                     attrs={
+                         'deterministic': True,
+                         'rng_name': rng_name,
+                         'force_cpu': True
+                     })
+    return out
+
+
+def dropout(x,
+            p=0.5,
+            axis=None,
+            rng_name=None,
+            training=True,
+            mode="upscale_in_train",
+            name=None):
+    """
+    Dropout is a regularization technique for reducing overfitting by preventing
+    neuron co-adaptation during training. The dropout operator randomly sets the
+    outputs of some units to zero, while upscaling the others according to the
+    given dropout probability.
+
+    Args:
+        x (Tensor): The input tensor. The data type is float32 or float64.
+        p (float|int): Probability of setting units to zero. Default 0.5.
+        axis (int|list|tuple): The axis along which the dropout is performed. Default None.
+        rng_name (str): The random seed generator name, which is used to obtain deterministic results.
+        training (bool): A flag indicating whether it is in the training phase or not. Default True.
+        mode(str): ['upscale_in_train'(default) | 'downscale_in_infer'].
+
+            1. upscale_in_train(default), upscale the output at training time
+
+                - train: out = input * mask / ( 1.0 - dropout_prob )
+                - inference: out = input
+
+            2. downscale_in_infer, downscale the output at inference
+
+                - train: out = input * mask
+                - inference: out = input * (1.0 - dropout_prob)
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor representing the dropout, has same shape and data type as `x` .
+
+
+    Examples:
+        We use ``p=0.5`` in the following description for simplicity.
+
+        1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly.
+
+        .. code-block:: text
+
+            Let's see a simple case when x is a 2d tensor with shape 2*3:
+            [[1 2 3]
+             [4 5 6]]
+            we generate mask with the same shape as x, which is 2*3. The value of mask is
+            sampled from a Bernoulli distribution randomly. For example, we may get such mask:
+            [[0 1 0]
+             [1 0 1]]
+            So the output is obtained from elementwise multiply of x and mask:
+            [[0 2 0]
+             [4 0 6]]
+            Using default setting, i.e. ``mode='upscale_in_train'`` ,
+            if in training phase, the final upscale output is:
+            [[0 4 0 ]
+             [8 0 12]]
+            if in test phase, the output is the same as input:
+            [[1 2 3]
+             [4 5 6]]
+            we can also set ``mode='downscale_in_infer'`` , then
+            if in training phase, the final output is:
+            [[0 2 0]
+             [4 0 6]]
+            if in test phase, the scaled output is:
+            [[0.5 1.  1.5]
+             [2.  2.5 3. ]]
+
+    """
+    if rng_name is None:
+        return paddle.nn.functional.dropout(x, p, axis, training, mode, name)
+
+    if not isinstance(p, (float, int, Variable)):
+        raise TypeError("p argument should be a number(int|float) or Variable")
+
+    # fast return for p == 0
+    if isinstance(p, (int, float)) and p == 0: return x
+
+    assert 0 <= p <= 1, ValueError("p argument should be between 0 and 1")
+    assert mode in ('downscale_in_infer', 'upscale_in_train'), \
+        ValueError(
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+
+    assert axis is None, \
+        TypeError("axis is not supported when using the random seed generator")
+
+    mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  # semantic transfer
+
+    # dygraph uses the tracker, so it doesn't need a deterministic seed
+    if _non_static_mode():
+        out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', p, 'is_test',
+                                          not training, 'fix_seed', False,
+                                          'seed', 0, 'dropout_implementation',
+                                          mode)
+        return out
+
+    seed = determinate_seed(rng_name)
+
+    if isinstance(p, Variable) and p.shape != [1]:
+        raise TypeError(
+            "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}"
+            .format(p.shape))
+
+    helper = LayerHelper('dropout', **locals())
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'dropout')
+
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    mask = helper.create_variable_for_type_inference(
+        dtype=core.VarDesc.VarType.UINT8, stop_gradient=True)
+
+    helper.append_op(type='dropout',
+                     inputs={
+                         'X': [x],
+                         'Seed': seed
+                     },
+                     outputs={
+                         'Out': [out],
+                         'Mask': [mask]
+                     },
+                     attrs={
+                         'dropout_prob': p,
+                         'is_test': not training,
+                         'dropout_implementation': mode,
+                     })
+    return out
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
index 6cb69bc73ce61..66a1c87756220 100644
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,298 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle
-from paddle.fluid import core
-from paddle.fluid.dygraph.layers import Layer
-from .random import get_rng_state_tracker
-from paddle.nn import functional as F
-from paddle import framework
-from ...base import topology as tp
-from paddle.autograd import PyLayer
+from ...layers.mpu.mp_layers import VocabParallelEmbedding  # noqa: F401
+from ...layers.mpu.mp_layers import ColumnParallelLinear  # noqa: F401
+from ...layers.mpu.mp_layers import RowParallelLinear  # noqa: F401
+from ...layers.mpu.mp_layers import ParallelCrossEntropy  # noqa: F401
 
 __all__ = []
-
-# Follow this paper to achieve the file:
-# Shoeybi M, Patwary M, Puri R, et al. Megatron-lm: Training multi-billion parameter
-# language models using model parallelism[J]. arXiv preprint arXiv:1909.08053, 2019.
(https://arxiv.org/abs/1909.08053) - - -def is_fused_matmul_bias_supported(): - if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): - return hasattr(core.ops, 'fused_gemm_epilogue') - else: - return False - - -class VocabParallelEmbedding(Layer): - - def __init__(self, - num_embeddings, - embedding_dim, - weight_attr=None, - mp_group=None, - name=None): - super(VocabParallelEmbedding, self).__init__() - - self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( - ) if mp_group is None else mp_group - self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( - ) if mp_group is None else mp_group.nranks - self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( - ) if mp_group is None else mp_group.rank - - self.origin_num_embeddings = num_embeddings - self.is_mp = (self.world_size > 1) - - assert num_embeddings % self.world_size == 0, ( - "The length of the vocabulary must be divisible by the parallelism degree of MP" - ) - - per_part_size = num_embeddings // self.world_size - - self.vocab_start_index = self.rank * per_part_size - self._dtype = self._helper.get_default_dtype() - self._size = [per_part_size, embedding_dim] - self._weight_attr = weight_attr - self._name = name - - if self.is_mp and paddle.in_dynamic_mode(): - with get_rng_state_tracker().rng_state(): - self.weight = self.create_parameter(attr=self._weight_attr, - shape=self._size, - dtype=self._dtype, - is_bias=False) - else: - self.weight = self.create_parameter(attr=self._weight_attr, - shape=self._size, - dtype=self._dtype, - is_bias=False) - - self.weight.is_distributed = True if self.is_mp else False - - def forward(self, x): - if self.is_mp: - output_parallel = paddle.distributed.collective._c_lookup_table( - self.weight, - x, - start_index=self.vocab_start_index, - name=self._name) - output = paddle.distributed.collective._mp_allreduce( - output_parallel, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) - else: - output = F.embedding(x, - weight=self.weight, - padding_idx=None, - sparse=False, - name=self._name) - return output - - -class ColumnParallelLinear(Layer): - - def __init__(self, - in_features, - out_features, - weight_attr=None, - has_bias=None, - gather_output=True, - fuse_matmul_bias=False, - mp_group=None, - name=None): - super(ColumnParallelLinear, self).__init__() - - self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( - ) if mp_group is None else mp_group - self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( - ) if mp_group is None else mp_group.nranks - self._name = name - self.is_mp = (self.world_size > 1) - - self.gather_output = gather_output - assert out_features % self.world_size == 0, ( - "Number of column of the weight for linear ({}) must be" - " divisible by model parallel size ({})".format( - out_features, self.world_size)) - self.output_size_per_partition = out_features // self.world_size - - self._weight_attr = weight_attr - self._dtype = self._helper.get_default_dtype() - - if self.is_mp and paddle.in_dynamic_mode(): - with get_rng_state_tracker().rng_state(): - self.weight = self.create_parameter( - shape=[in_features, self.output_size_per_partition], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - else: - self.weight = self.create_parameter( - shape=[in_features, self.output_size_per_partition], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - - self.weight.is_distributed = True if self.is_mp else 
False - - if has_bias: - # initialize bias to zero like Megatron - self.bias = self.create_parameter( - shape=[self.output_size_per_partition], - attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype, - is_bias=True) - self.bias.is_distributed = True if self.is_mp else False - else: - self.bias = None - - self.linear = F.linear - - if fuse_matmul_bias: - if not is_fused_matmul_bias_supported(): - raise NotImplementedError( - "You set fuse_matmul_bias=True in ColumnParallelLinear, " - "however, the paddle you are using not support this operation. " - "Please set fuse_matmul_bias=False or use paddle compiled " - "with cuda 11.6 or higher.") - from paddle.incubate.nn.functional import fused_linear - self.linear = fused_linear - - def forward(self, x): - # use inner api to process identity - if self.is_mp: - input_parallel = paddle.distributed.collective._c_identity( - x, group=self.model_parallel_group) - else: - input_parallel = x - - output_parallel = self.linear(input_parallel, - self.weight, - self.bias, - name=self._name) - - if self.gather_output and self.is_mp: - output = paddle.distributed.collective._c_concat( - output_parallel, group=self.model_parallel_group) - else: - output = output_parallel - return output - - -class RowParallelLinear(Layer): - - def __init__(self, - in_features, - out_features, - weight_attr=None, - has_bias=True, - input_is_parallel=False, - fuse_matmul_bias=False, - mp_group=None, - name=None): - super(RowParallelLinear, self).__init__() - - self.in_features = in_features - self.out_features = out_features - self.input_is_parallel = input_is_parallel - self._weight_attr = weight_attr - self._dtype = self._helper.get_default_dtype() - self._name = name - - self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( - ) if mp_group is None else mp_group - self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( - ) if mp_group is None else mp_group.nranks - self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( - ) if mp_group is None else mp_group.rank - - self.is_mp = (self.world_size > 1) - assert in_features % self.world_size == 0, ( - "Number of row of the weight for linear ({}) must be" - " divisible by model parallel size ({})".format( - in_features, self.world_size)) - - self.input_size_per_partition = in_features // self.world_size - - if self.is_mp and paddle.in_dynamic_mode(): - with get_rng_state_tracker().rng_state(): - self.weight = self.create_parameter( - shape=[self.input_size_per_partition, self.out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - else: - self.weight = self.create_parameter( - shape=[self.input_size_per_partition, self.out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - - self.weight.is_distributed = True if self.is_mp else False - - if has_bias: - self.bias = self.create_parameter( - shape=[self.out_features], - attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype, - is_bias=True) - else: - self.bias = None - - self.linear = F.linear - - if fuse_matmul_bias: - if not is_fused_matmul_bias_supported(): - raise NotImplementedError( - "You set fuse_matmul_bias=True in RowParallelLinear, " - "however, the paddle you are using not support this operation. 
" - "Please set fuse_matmul_bias=False or use paddle compiled " - "with cuda 11.6 or higher.") - from paddle.incubate.nn.functional import fused_linear - self.linear = fused_linear - - def forward(self, x): - if self.input_is_parallel or (not self.is_mp): - input_parallel = x - else: - # split last dim - input_parallel = paddle.distributed.collective._c_split( - x, group=self.model_parallel_group) - - if self.is_mp: - output_parallel = self.linear(input_parallel, - self.weight, - name=self._name) - output_ = paddle.distributed.collective._mp_allreduce( - output_parallel, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) - output = output_ + self.bias if self.bias is not None else output_ - else: - output = self.linear(input_parallel, - self.weight, - self.bias, - name=self._name) - - return output - - -class ParallelCrossEntropy(Layer): - - def __init__(self, mp_group=None, name=None): - super(ParallelCrossEntropy, self).__init__() - self.name = name - self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( - ) if mp_group is None else mp_group - self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( - ) if mp_group is None else mp_group.nranks - self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( - ) if mp_group is None else mp_group.rank - - def forward(self, input, label): - loss = paddle.distributed.collective._c_softmax_with_cross_entropy( - input, label, group=self.model_parallel_group) - return loss diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index 900c0f79798fc..9deed30db66f5 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,232 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle -import contextlib -import numpy as np -from paddle import _C_ops, _legacy_C_ops -from paddle.fluid import core -from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle.fluid.framework import _non_static_mode, default_main_program, Variable -from paddle.fluid.layer_helper import LayerHelper +from ...layers.mpu.random import RNGStatesTracker # noqa: F401 +from ...layers.mpu.random import get_rng_state_tracker # noqa: F401 +from ...layers.mpu.random import model_parallel_random_seed # noqa: F401 +from ...layers.mpu.random import determinate_seed # noqa: F401 +from ...layers.mpu.random import dropout # noqa: F401 __all__ = [] - -MODEL_PARALLEL_RNG = 'model_parallel_rng' - -# This file is inspired by Megatron to control random states for MP: -# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/random.py - - -class RNGStatesTracker: - """ - Tracker the RNG states. - """ - - def __init__(self): - # Map from name to the rng state. 
- self.states_ = {} - self.seeds_ = set() - - def reset(self): - self.states_ = {} - self.seeds_ = set() - - def add(self, name, seed): - if seed in self.seeds_: - raise ValueError('seed {} already exists'.format(seed)) - self.seeds_.add(seed) - if name in self.states_: - raise ValueError('state {} already exists'.format(name)) - orig_rng_state = paddle.get_cuda_rng_state() - paddle.seed(seed) - self.states_[name] = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(orig_rng_state) - - def get_states_tracker(self): - states = {} - for name in self.states_: - states[name] = self.states_[name] - return states - - def set_states_tracker(self, states): - self.states_ = states - - @contextlib.contextmanager - def rng_state(self, name=MODEL_PARALLEL_RNG): - if name not in self.states_: - raise ValueError('state {} does not exist'.format(name)) - orig_cuda_rng_state = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(self.states_[name]) - try: - yield - finally: - self.states_[name] = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(orig_cuda_rng_state) - - -RNG_STATE_TRACKER = RNGStatesTracker() - - -def get_rng_state_tracker(): - return RNG_STATE_TRACKER - - -def model_parallel_random_seed(seed=None): - import paddle.distributed.fleet as fleet - hcg = fleet.get_hybrid_communicate_group() - rank = hcg.get_model_parallel_rank() - - if seed: - global_seed = seed - local_seed = seed * 1024 + rank * 100 - else: - global_seed = np.random.randint(0, 655350) - local_seed = np.random.randint(rank * 10000, (rank + 1) * 10000 - 1) - - RNG_STATE_TRACKER.reset() - RNG_STATE_TRACKER.add(MODEL_PARALLEL_RNG, local_seed) - paddle.seed(global_seed) - - -def determinate_seed(rng_name): - assert rng_name is not None and rng_name != "" - helper = LayerHelper('seed', **locals()) - out = helper.create_variable_for_type_inference(dtype=paddle.int32) - # set force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang - helper.append_op(type='seed', - outputs={'Out': out}, - attrs={ - 'deterministic': True, - 'rng_name': rng_name, - 'force_cpu': True - }) - return out - - -def dropout(x, - p=0.5, - axis=None, - rng_name=None, - training=True, - mode="upscale_in_train", - name=None): - """ - Dropout is a regularization technique for reducing overfitting by preventing - neuron co-adaption during training. The dropout operator randomly sets the - outputs of some units to zero, while upscale others according to the given - dropout probability. - - Args: - x (Tensor): The input tensor. The data type is float32 or float64. - p (float|int): Probability of setting units to zero. Default 0.5. - axis (int|list|tuple): The axis along which the dropout is performed. Default None. - rng_name (str): The random seed generator name, which used to obtain deterministic results. - training (bool): A flag indicating whether it is in train phrase or not. Default True. - mode(str): ['upscale_in_train'(default) | 'downscale_in_infer']. - - 1. upscale_in_train(default), upscale the output at training time - - - train: out = input * mask / ( 1.0 - dropout_prob ) - - inference: out = input - - 2. downscale_in_infer, downscale the output at inference - - - train: out = input * mask - - inference: out = input * (1.0 - dropout_prob) - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - A Tensor representing the dropout, has same shape and data type as `x` . 
- - - Examples: - We use ``p=0.5`` in the following description for simplicity. - - 1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly. - - .. code-block:: text - - Let's see a simple case when x is a 2d tensor with shape 2*3: - [[1 2 3] - [4 5 6]] - we generate mask with the same shape as x, which is 2*3. The value of mask is - sampled from a Bernoulli distribution randomly. For example, we may get such mask: - [[0 1 0] - [1 0 1]] - So the output is obtained from elementwise multiply of x and mask: - [[0 2 0] - [4 0 6]] - Using default setting, i.e. ``mode='upscale_in_train'`` , - if in training phase, the final upscale output is: - [[0 4 0 ] - [8 0 12]] - if in test phase, the output is the same as input: - [[1 2 3] - [4 5 6]] - we can also set ``mode='downscale_in_infer'`` , then - if in training phase, the final output is: - [[0 2 0] - [4 0 6]] - if in test phase, the scale output is: - [[0.5 1. 1.5] - [2. 2.5 3. ]] - - """ - if rng_name is None: - return paddle.nn.functional.dropout(x, p, axis, training, mode, name) - - if not isinstance(p, (float, int, Variable)): - raise TypeError("p argument should be a number(int|float) or Variable") - - # fast return for p == 0 - if isinstance(p, (int, float)) and p == 0: return x - - assert 0 <= p <= 1, ValueError("p argument should between 0 and 1") - assert mode in ('downscale_in_infer', 'upscale_in_train'), \ - ValueError( - "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") - - assert axis is None, \ - TypeError("unsupport axis when using random seed generator") - - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer - - # dygraph using tracker, doesn't need determinate seed - if _non_static_mode(): - out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', p, 'is_test', - not training, 'fix_seed', False, - 'seed', 0, 'dropout_implementation', - mode) - return out - - seed = determinate_seed(rng_name) - - if isinstance(p, Variable) and not p.shape != [1]: - raise TypeError( - "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}" - .format(p.shape)) - - helper = LayerHelper('dropout', **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'dropout') - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - mask = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) - - helper.append_op(type='dropout', - inputs={ - 'X': [x], - 'Seed': seed - }, - outputs={ - 'Out': [out], - 'Mask': [mask] - }, - attrs={ - 'dropout_prob': p, - 'is_test': not training, - 'dropout_implementation': mode, - }) - return out diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py index 9495ffa22b0c6..44f504887cf16 100644 --- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py +++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py @@ -174,7 +174,7 @@ def _analyze_program(self): def _could_be_prune(self): - return self.dist_context._gradient_scale and ( + return self.dist_context.gradient_scale and ( self._support_rescale_grad or self._all_dp_groups_same_degree()) def _all_dp_groups_same_degree(self): diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 89ff2019d7392..64562668a42ac 100644 --- 
a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -380,6 +380,10 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, # create cast grad grad_slot_name = slot_name + "@GRAD" assert grad_slot_name in op.output_names + if len(op.output(grad_slot_name)) == 0: + var = block.var(src_name) + assert var.stop_gradient is True + continue assert len(op.output(grad_slot_name)) == 1 grad_name = op.output(grad_slot_name)[0] grad = block.var(grad_name) diff --git a/python/paddle/distributed/utils/__init__.py b/python/paddle/distributed/utils/__init__.py index 12e0ede6bd018..4ce89fa36b06b 100644 --- a/python/paddle/distributed/utils/__init__.py +++ b/python/paddle/distributed/utils/__init__.py @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ['global_scatter', 'global_gather'] -from moe_utils import global_gather, global_scatter +__all__ = [] diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 6557a0aec0101..f8de55ee3caea 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -448,13 +448,14 @@ def _optimize_fp32_graph(self, graph): graph = self._apply_pass(graph, 'fc_act_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'matmul_transpose_reshape_mkldnn_fuse_pass') + graph = self._apply_pass(graph, + 'matmul_elementwise_add_mkldnn_fuse_pass') + graph = self._apply_pass(graph, 'matmul_activation_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'batch_norm_act_fuse_pass') graph = self._apply_pass(graph, 'softplus_activation_mkldnn_fuse_pass') graph = self._apply_pass(graph, 'scale_matmul_fuse_pass') graph = self._apply_pass(graph, 'reshape_transpose_matmul_mkldnn_fuse_pass') - graph = self._apply_pass(graph, - 'matmul_elementwise_add_mkldnn_fuse_pass') # the following pass should be the last one since it will work on all fused ops. graph = self._apply_pass(graph, 'runtime_context_cache_pass') return graph diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 1fa3c769d77fb..09a659ff5730a 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -24,15 +24,10 @@ if os.name == 'nt': core_suffix = 'pyd' -has_avx_core = False -has_noavx_core = False - +has_libpaddle_so = False current_path = os.path.abspath(os.path.dirname(__file__)) -if os.path.exists(current_path + os.sep + 'core_avx.' + core_suffix): - has_avx_core = True - -if os.path.exists(current_path + os.sep + 'core_noavx.' + core_suffix): - has_noavx_core = True +if os.path.exists(current_path + os.sep + 'libpaddle.' + core_suffix): + has_libpaddle_so = True try: if os.name == 'nt': @@ -198,10 +193,8 @@ def load_dso(dso_absolute_path): def pre_load(dso_name): - if has_avx_core: - core_so = current_path + os.sep + 'core_avx.' + core_suffix - elif has_noavx_core: - core_so = current_path + os.sep + 'core_noavx.' + core_suffix + if has_libpaddle_so: + core_so = current_path + os.sep + 'libpaddle.' + core_suffix else: core_so = None dso_path = get_dso_path(core_so, dso_name) @@ -239,7 +232,7 @@ def to_list(s): # (1) the number of dynamic shared librarys (DSO) loaded > 14, # (2) after that, load a dynamic shared library (DSO) with static TLS. 
# For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs.
-# So, here is a tricky way to solve the problem by pre load 'libgomp' before 'core_avx.so'.
+# So, here is a tricky way to solve the problem by preloading 'libgomp' before 'libpaddle.so'.
# The final solution is to upgrade glibc to > 2.22 on the target system.
if platform.system().lower() == 'linux':
    libc_type, libc_ver = get_libc_ver()
@@ -247,123 +240,65 @@ def to_list(s):
        try:
            pre_load('libgomp')
        except Exception as e:
-            # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so
+            # NOTE(zhiqiu): do not abort if failed, since it may succeed when importing libpaddle.so
            sys.stderr.write('Error: Can not preload libgomp.so')

-load_noavx = False
-
-if avx_supported():
-    try:
-        from . import core_avx
-        core_avx.LoDTensor = core_avx.Tensor
-
-        from .core_avx import *
-        from .core_avx import __doc__, __file__, __name__, __package__
-        from .core_avx import __unittest_throw_exception__
-        from .core_avx import _append_python_callable_object_and_return_id
-        from .core_avx import _cleanup, _Scope
-        from .core_avx import _get_use_default_grad_op_desc_maker_ops
-        from .core_avx import _get_all_register_op_kernels
-        from .core_avx import _is_program_version_supported
-        from .core_avx import _set_eager_deletion_mode
-        from .core_avx import _get_eager_deletion_vars
-        from .core_avx import _set_fuse_parameter_group_size
-        from .core_avx import _set_fuse_parameter_memory_size
-        from .core_avx import _is_dygraph_debug_enabled
-        from .core_avx import _dygraph_debug_level
-        from .core_avx import _switch_tracer
-        from .core_avx import _set_paddle_lib_path
-        from .core_avx import _create_loaded_parameter
-        from .core_avx import _cuda_synchronize
-        from .core_avx import _is_compiled_with_heterps
-        from .core_avx import _promote_types_if_complex_exists
-        from .core_avx import _set_cached_executor_build_strategy
-        from .core_avx import _device_synchronize
-        from .core_avx import _get_current_stream
-        from .core_avx import _Profiler, _ProfilerResult, _RecordEvent
-        from .core_avx import _set_current_stream
-        if sys.platform != 'win32':
-            from .core_avx import _set_process_pids
-            from .core_avx import _erase_process_pids
-            from .core_avx import _set_process_signal_handler
-            from .core_avx import _throw_error_if_process_failed
-            from .core_avx import _convert_to_tensor_list
-            from .core_avx import _array_to_share_memory_tensor
-            from .core_avx import _cleanup_mmap_fds
-            from .core_avx import _remove_tensor_list_mmap_fds
-    except Exception as e:
-        if has_avx_core:
-            sys.stderr.write(
-                'Error: Can not import avx core while this file exists: ' +
-                current_path + os.sep + 'core_avx.' + core_suffix + '\n')
-            raise e
-        else:
-            from .. import compat as cpt
-            sys.stderr.write(
-                "Hint: Your machine support AVX, but the installed paddlepaddle doesn't have avx core. "
-                "Hence, no-avx core with worse preformance will be imported.\nIf you like, you could "
-                "reinstall paddlepaddle by 'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' "
-                "to get better performance.\nThe original error is: %s\n" %
-                cpt.get_exception_message(e))
-            load_noavx = True
-else:
-    load_noavx = True
-
-if load_noavx:
-    try:
-        from . import core_noavx
import core_noavx - core_noavx.LoDTensor = core_noavx.Tensor - - from .core_noavx import * - from .core_noavx import __doc__, __file__, __name__, __package__ - from .core_noavx import __unittest_throw_exception__ - from .core_noavx import _append_python_callable_object_and_return_id - from .core_noavx import _cleanup, _Scope - from .core_noavx import _get_use_default_grad_op_desc_maker_ops - from .core_noavx import _get_all_register_op_kernels - from .core_noavx import _is_program_version_supported - from .core_noavx import _set_eager_deletion_mode - from .core_noavx import _get_eager_deletion_vars - from .core_noavx import _set_fuse_parameter_group_size - from .core_noavx import _set_fuse_parameter_memory_size - from .core_noavx import _is_dygraph_debug_enabled - from .core_noavx import _dygraph_debug_level - from .core_noavx import _switch_tracer - from .core_noavx import _set_paddle_lib_path - from .core_noavx import _create_loaded_parameter - from .core_noavx import _cuda_synchronize - from .core_noavx import _is_compiled_with_heterps - from .core_noavx import _promote_types_if_complex_exists - from .core_noavx import _set_cached_executor_build_strategy - from .core_noavx import _device_synchronize - from .core_noavx import _get_current_stream - from .core_noavx import _set_current_stream - from .core_noavx import _Profiler, _ProfilerResult, _RecordEvent - if sys.platform != 'win32': - from .core_noavx import _set_process_pids - from .core_noavx import _erase_process_pids - from .core_noavx import _set_process_signal_handler - from .core_noavx import _throw_error_if_process_failed - from .core_noavx import _convert_to_tensor_list - from .core_noavx import _array_to_share_memory_tensor - from .core_noavx import _cleanup_mmap_fds - from .core_noavx import _remove_tensor_list_mmap_fds - except Exception as e: - if has_noavx_core: - sys.stderr.write( - 'Error: Can not import noavx core while this file exists: ' + - current_path + os.sep + 'core_noavx.' + core_suffix + '\n') - elif avx_supported(): - sys.stderr.write( - "Error: The installed PaddlePaddle is incorrect. You should reinstall it by " - "'python -m pip install --force-reinstall paddlepaddle-gpu[==version]'\n" - ) - else: - sys.stderr.write( - "Error: Your machine doesn't support AVX, but the installed PaddlePaddle is avx core, " - "you should reinstall paddlepaddle with no-avx core.\n") - - raise e +try: + from . import libpaddle + if avx_supported() and not libpaddle.is_compiled_with_avx(): + sys.stderr.write( + "Hint: Your machine support AVX, but the installed paddlepaddle doesn't have avx core. 
" + "Hence, no-avx core with worse preformance will be imported.\nIf you like, you could " + "reinstall paddlepaddle by 'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' " + "to get better performance.\n") + + # assign tensor alias + libpaddle.LoDTensor = libpaddle.Tensor + + from .libpaddle import * + from .libpaddle import __doc__, __file__, __name__, __package__ + from .libpaddle import __unittest_throw_exception__ + from .libpaddle import _append_python_callable_object_and_return_id + from .libpaddle import _cleanup, _Scope + from .libpaddle import _get_use_default_grad_op_desc_maker_ops + from .libpaddle import _get_all_register_op_kernels + from .libpaddle import _is_program_version_supported + from .libpaddle import _set_eager_deletion_mode + from .libpaddle import _get_eager_deletion_vars + from .libpaddle import _set_fuse_parameter_group_size + from .libpaddle import _set_fuse_parameter_memory_size + from .libpaddle import _is_dygraph_debug_enabled + from .libpaddle import _dygraph_debug_level + from .libpaddle import _switch_tracer + from .libpaddle import _set_paddle_lib_path + from .libpaddle import _create_loaded_parameter + from .libpaddle import _cuda_synchronize + from .libpaddle import _is_compiled_with_heterps + from .libpaddle import _promote_types_if_complex_exists + from .libpaddle import _set_cached_executor_build_strategy + from .libpaddle import _device_synchronize + from .libpaddle import _get_current_stream + from .libpaddle import _Profiler, _ProfilerResult, _RecordEvent + from .libpaddle import _set_current_stream + if sys.platform != 'win32': + from .libpaddle import _set_process_pids + from .libpaddle import _erase_process_pids + from .libpaddle import _set_process_signal_handler + from .libpaddle import _throw_error_if_process_failed + from .libpaddle import _convert_to_tensor_list + from .libpaddle import _array_to_share_memory_tensor + from .libpaddle import _cleanup_mmap_fds + from .libpaddle import _remove_tensor_list_mmap_fds +except Exception as e: + if has_libpaddle_so: + sys.stderr.write( + 'Error: Can not import paddle core while this file exists: ' + + current_path + os.sep + 'libpaddle.' + core_suffix + '\n') + if not avx_supported() and libpaddle.is_compiled_with_avx(): + sys.stderr.write( + "Error: Your machine doesn't support AVX, but the installed PaddlePaddle is avx core, " + "you should reinstall paddlepaddle with no-avx core.\n") + raise e def set_paddle_custom_device_lib_path(lib_path): diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index a0275ac57ce45..c793515379547 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -381,7 +381,8 @@ def keep_name_table(self, value): def _parse_save_configs(configs): supported_configs = [ - 'output_spec', "with_hook", "combine_params", "clip_extra" + 'output_spec', "with_hook", "combine_params", "clip_extra", + "skip_forward" ] # input check @@ -397,6 +398,7 @@ def _parse_save_configs(configs): inner_config.with_hook = configs.get('with_hook', False) inner_config.combine_params = configs.get("combine_params", False) inner_config.clip_extra = configs.get("clip_extra", False) + inner_config.skip_forward = configs.get("skip_forward", False) return inner_config @@ -522,7 +524,10 @@ def _build_load_path_and_config(path, config): "don't know which one to load, please make sure that the specified target " "of ``path`` is unique." 
% (path, path)) elif not prefix_format_exist and not directory_format_exist: - raise ValueError("The ``path`` (%s) to load model not exists." % path) + raise ValueError("The ``path`` (%s) to load model not exists. " + "Please make sure that *.pdmodel exists or " + "don't using ``skip_forward=True`` to jit.save." % + path) else: if prefix_format_exist: file_prefix = os.path.basename(path) @@ -906,6 +911,7 @@ def fun(inputs): combine_vars = {} property_vals = [] # (value, key) + concrete_program = None for attr_func in functions: if isinstance(layer, Layer): static_func = getattr(inner_layer, attr_func, None) @@ -921,6 +927,10 @@ def fun(inputs): concrete_program = static_func.concrete_program_specify_input_spec( inner_input_spec, with_hook=with_hook) elif 'forward' == attr_func: + if configs.skip_forward: + # do not jit.save forward function + continue + # transform in jit.save, if input_spec is incomplete, declarative will throw error # inner_input_spec is list[InputSpec], it should be packed with same structure # as original input_spec here. @@ -1100,10 +1110,10 @@ def fun(inputs): # file `***.pdiparams.info` # "layer" can only be Layer or function or StaticFunction. - contain_parameter = False - for var in concrete_program.main_program.list_vars(): - contain_parameter |= isinstance(var, Parameter) + if concrete_program is not None: + for var in concrete_program.main_program.list_vars(): + contain_parameter |= isinstance(var, Parameter) if (isinstance(layer, Layer) or contain_parameter) and extra_var_info: with scope_guard(scope): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a4400d6272f9e..5747de7ddd2d4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1160,19 +1160,22 @@ def grad_var_name(var_name): def convert_np_dtype_to_dtype_(np_dtype): """ - Convert the data type in numpy to the data type in Paddle + Convert the data type in numpy to the data type in Paddle. Args: - np_dtype(np.dtype): the data type in numpy. + np_dtype (np.dtype|str): The data type in numpy or valid data type + string. Returns: - core.VarDesc.VarType: the data type in Paddle. + core.VarDesc.VarType: The data type in Paddle. """ - if np_dtype == "bfloat16": + # Convert the data type string to numpy data type. 
+ if isinstance(np_dtype, str) and np_dtype == "bfloat16": dtype = np.uint16 else: dtype = np.dtype(np_dtype) + if dtype == np.float32: return core.VarDesc.VarType.FP32 elif dtype == np.float64: diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 8ddb94efc0dd4..0b31c322b1513 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -374,19 +374,6 @@ def forward(self, var, block=None): ["uint16", "float16", "float32", "float64"], "guassian_random") - # to be compatible of fp16 initalizers - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var(name=unique_name.generate(".".join( - ['normal_init', var.name, 'tmp'])), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False) - else: - out_dtype = var.dtype - out_var = var - if self._seed == 0: self._seed = block.program.random_seed @@ -394,48 +381,29 @@ def forward(self, var, block=None): place = _current_expected_place() out_var = _C_ops.gaussian_random(var.shape, self._mean, self._std_dev, self._seed, - out_dtype, place) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) + var.dtype, place) + out_var._share_underline_tensor_to(var) return None if _in_legacy_dygraph(): out_var = _legacy_C_ops.gaussian_random( - 'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean, + 'shape', var.shape, 'dtype', var.dtype, 'mean', self._mean, 'std', self._std_dev, 'seed', self._seed, 'use_mkldnn', False) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) + out_var._share_underline_tensor_to(var) return None else: op = block.append_op(type="gaussian_random", - outputs={"Out": out_var}, + outputs={"Out": var}, attrs={ "shape": var.shape, - "dtype": out_dtype, + "dtype": var.dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed, "use_mkldnn": False }, stop_gradient=True) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op(type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={ - "in_dtype": out_var.dtype, - "out_dtype": var.dtype - }) var.op = op return op @@ -695,7 +663,7 @@ def forward(self, var, block=None): outputs={"Out": out_var}, attrs={ "shape": out_var.shape, - "dtype": out_dtype, + "dtype": out_var.dtype, "mean": 0.0, "std": std, "seed": self._seed diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py index e162daf2b87e1..39d47d6f4483b 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py @@ -61,9 +61,7 @@ def build_extensions(self): ] # libs -libs = [':core_avx.so'] -if not core.has_avx_core and core.has_noavx_core: - libs = [':core_noavx.so'] +libs = [':libpaddle.so'] custom_kernel_dot_module = Extension( 'custom_kernel_dot', diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index efe5368cdca56..ba116526d3ff4 100644 --- 
a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -62,9 +62,7 @@ def build_extensions(self): map(lambda path: os.path.join(path, 'paddle', 'fluid'), site_packages_path)) # libs -libs = [':core_avx.so'] -if not core.has_avx_core and core.has_noavx_core: - libs = [':core_noavx.so'] +libs = [':libpaddle.so'] custom_kernel_dot_module = Extension( 'custom_kernel_dot', diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt index 099b1ddc1c01e..820e2b357aaf5 100644 --- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) set(PLUGIN_URL https://github.com/PaddlePaddle/PaddleCustomDevice.git) - set(PLUGIN_TAG d5e5ac1d8e9f7588d4c2998bb3b5ffc66f65af2e) + set(PLUGIN_TAG b9ae8452f31525d0524810461b17856838acd821) file( GLOB TEST_OPS diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 27f86dc9f100a..bbccf452742a3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -37,9 +37,29 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ${dist_ENVS}) set_tests_properties(test_high_order_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) - py_test_modules(test_grad_clip MODULES test_grad_clip ENVS ${dist_ENVS}) - set_tests_properties(test_grad_clip PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" - TIMEOUT 50) + py_test_modules(test_iterable_dataset MODULES test_iterable_dataset ENVS + ${dist_ENVS}) + set_tests_properties(test_iterable_dataset + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + py_test_modules(test_pass_grad_clip MODULES test_pass_grad_clip ENVS + ${dist_ENVS}) + set_tests_properties(test_pass_grad_clip + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_pass_gradient_merge MODULES test_pass_gradient_merge + ENVS ${dist_ENVS}) + set_tests_properties(test_pass_gradient_merge + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_pass_recompute MODULES test_pass_recompute ENVS + ${dist_ENVS}) + set_tests_properties(test_pass_recompute + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_pass_sharding MODULES test_pass_sharding ENVS + ${dist_ENVS}) + set_tests_properties(test_pass_sharding + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_pass_amp MODULES test_pass_amp ENVS ${dist_ENVS}) + set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" + TIMEOUT 50) py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) @@ -70,11 +90,10 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_process_mesh_v2 MODULES test_process_mesh_v2) py_test_modules(test_dist_attr_v2 MODULES test_dist_attr_v2) py_test_modules(test_lr_grad_clip MODULES test_lr_grad_clip) - py_test_modules(test_quantization MODULES test_quantization) py_test_modules(test_dist_matmul MODULES test_dist_matmul) + py_test_modules(test_process_mesh MODULES test_process_mesh) + py_test_modules(test_interface MODULES test_interface) + py_test_modules(test_strategy MODULES test_strategy) + py_test_modules(test_pass_quantization MODULES test_pass_quantization) - py_test_modules(test_iterable_dataset MODULES test_iterable_dataset ENVS - 
${dist_ENVS}) - set_tests_properties(test_iterable_dataset - PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py new file mode 100644 index 0000000000000..5ca2d8132e294 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py @@ -0,0 +1,120 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import sys +import random +import numpy as np +import paddle + +import paddle.distributed.auto_parallel as auto +from paddle.fluid.dygraph.parallel import ParallelEnv +from get_gpt_model import generate_model, create_data_holder, FakeDataset + + +def apply_pass(use_amp=False, level=None): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_amp: + amp = strategy.amp + amp.enable = True + amp.custom_white_list = ['softmax', 'layer_norm', 'gelu'] + amp.custom_black_list = [ + 'c_softmax_with_cross_entropy', 'elementwise_div', 'reduce_sum' + ] + amp.init_loss_scaling = 32768 + amp.use_fp16_guard = False + amp.use_pure_fp16 = level in ["o2", "o3"] + amp.use_optimizer_fp16 = level == "o3" + print("amp level: ", level) + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestAMPPass(unittest.TestCase): + + def setUp(self): + self.rtol = 1e-5 + self.atol = 1e-8 + self.batch_size = 1 + self.batch_num = 10 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + + def init(self, engine): + paddle.seed(2021) + np.random.seed(2021) + random.seed(2021) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_amp=False, level=None): + reset_prog() + + strategy = apply_pass(use_amp, level) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model("mp") + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_results(self, ref_losses, check_losses, rtol=None, atol=None): + np.testing.assert_allclose( + ref_losses, + check_losses, + rtol=rtol or self.rtol, + atol=atol or self.atol, + err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format( + __class__, ref_losses, check_losses, ref_losses - check_losses)) + + def test_amp_pass(self): + # mp2 training + mp_engine = self.get_engine() + mp_losses = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + mp_losses = np.array(mp_losses["loss"]) + + # mp2 amp-o1 training + amp_o1_engine = self.get_engine(True, "o1") + amp_o1_losses = amp_o1_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + amp_o1_losses = np.array(amp_o1_losses["loss"]) 
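The o1/o2/o3 levels threaded through `apply_pass` above reduce to two booleans on `strategy.amp`. The sketch below only restates that mapping for clarity; it is an illustration reusing the names from the hunk above, not a separate Paddle API:

def amp_flags(level):
    # o1: mixed precision driven purely by the custom white/black lists
    # o2/o3: run the model itself in pure fp16
    use_pure_fp16 = level in ["o2", "o3"]
    # o3: additionally keep the optimizer in fp16
    use_optimizer_fp16 = level == "o3"
    return use_pure_fp16, use_optimizer_fp16

assert amp_flags("o1") == (False, False)
assert amp_flags("o2") == (True, False)
assert amp_flags("o3") == (True, True)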
+ # self.check_results(mp_losses, amp_o1_losses) + + # mp2 amp-o2 training + amp_o2_engine = self.get_engine(True, "o2") + amp_o2_losses = amp_o2_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + amp_o2_losses = np.array(amp_o2_losses["loss"]) + # self.check_results(mp_losses, amp_o2_losses) + + # mp2 amp-o3 training + amp_o3_engine = self.get_engine(True, "o3") + amp_o3_losses = amp_o3_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + amp_o3_losses = np.array(amp_o3_losses["loss"]) + # self.check_results(mp_losses, amp_o3_losses) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py index d459ffd6d680d..4639abf32554e 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py @@ -32,7 +32,7 @@ paddle.enable_static() _global_parallel_strategy = None -_global_process_mesh = None +_global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) batch_size = 4 hidden_size = 1024 sequence_len = 512 @@ -103,11 +103,7 @@ def mlp_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len, 1], dtype='float32') - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mappig": [-1, -1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -126,9 +122,6 @@ def mlp_pretrain_forward(train_program, start_program): def train(): - global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) - dist_strategy = fleet.DistributedStrategy() dist_strategy.amp = False dist_strategy.pipeline = False diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py b/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py index 60a915c53cddf..1a8c5e6072cba 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py @@ -18,24 +18,21 @@ import numpy as np import paddle -import paddle.distributed.fleet as fleet import paddle.distributed.auto_parallel as auto - -from paddle.distributed.auto_parallel.engine import Engine +from paddle.fluid.dygraph.parallel import ParallelEnv from get_gpt_model import generate_model, create_data_holder, FakeDataset paddle.enable_static() def apply_pass(use_sharding=False): - strategy = fleet.DistributedStrategy() - strategy.semi_auto = True + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True if use_sharding: - strategy.sharding = True - strategy.sharding_configs = { - "sharding_degree": 2, - "stage": 2, - } + sharding = strategy.sharding + sharding.sharding_degree = 2 + sharding.stage = 2 return strategy @@ -76,34 +73,17 @@ def init(self, engine): paddle.seed(2022) np.random.seed(2022) random.seed(2022) - engine.mode = "train" - engine._executor.run(engine.startup_program) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) - def get_dp2_engine(self): + def get_engine(self, use_sharding=False): reset_prog() - strategy = apply_pass() + strategy = apply_pass(use_sharding) clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) opt = 
paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) model, loss = generate_model("dp") - inputs_spec, labels_spec = create_data_holder(self.batch_size) - - engine = Engine(model, inputs_spec, labels_spec, strategy=strategy) - engine.prepare(optimizer=opt, loss=loss) - self.init(engine) - return engine - - def get_dp2sharding2_engine(self): - reset_prog() - - strategy = apply_pass(True) - clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) - opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) - model, loss = generate_model("dp") - inputs_spec, labels_spec = create_data_holder(self.batch_size) - - engine = Engine(model, inputs_spec, labels_spec, strategy=strategy) - engine.prepare(optimizer=opt, loss=loss) + engine = auto.Engine(model, loss, opt, strategy=strategy) self.init(engine) return engine @@ -121,15 +101,13 @@ def check_result(self, dp_params, sharding_params): def test_grad_clip(self): # dp2 training - dp_engine = self.get_dp2_engine() - dp_engine.fit(self.dataset, batch_size=self.batch_size, use_cache=True) + dp_engine = self.get_engine() + dp_engine.fit(self.dataset, 3, batch_size=self.batch_size) dp_param_values = get_parameter_value(dp_engine.main_program) # dp2sharding2 training - sharding_engine = self.get_dp2sharding2_engine() - sharding_engine.fit(self.dataset, - batch_size=self.batch_size, - use_cache=True) + sharding_engine = self.get_engine(True) + sharding_engine.fit(self.dataset, 3, batch_size=self.batch_size) sharding_param_values = get_parameter_value( sharding_engine.main_program) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index 104614e3e9d4e..94677645ad4e8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -27,10 +27,8 @@ import paddle.utils as utils from paddle.fluid import layers from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet + import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine from paddle.optimizer.lr import CosineAnnealingDecay from paddle.fluid.dataloader.collate import default_collate_fn @@ -47,6 +45,8 @@ paddle.seed(44) +is_fetch = True + class MyDataset(Dataset): @@ -90,19 +90,20 @@ def __init__(self, self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") def forward(self, input): - out = auto.shard_op(self.norm, dist_attr={"process_mesh": - PP_MESH_0})(input) + out = auto.shard_op(self.norm, PP_MESH_0)(input) out = self.linear0(out) out = F.gelu(out, approximate=True) - out = auto.shard_op(self.linear1, dist_attr={"process_mesh": - PP_MESH_1})(out) + out = auto.shard_op(self.linear1, PP_MESH_1)(out) out = self.dropout(out) out = self.linear2(out) - self.out = out + if is_fetch: + auto.fetch(out, "out") return out def train(fetch): + global is_fetch + is_fetch = fetch mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, dropout_ratio=0.1, @@ -113,46 +114,34 @@ def train(fetch): beta2=0.999, epsilon=1e-08, grad_clip=None) + metric = paddle.metric.Accuracy() - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels_spec = InputSpec([batch_size], 'int64', 'label') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) - - # init engine - engine = Engine(mlp, - 
inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy) - engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) + strategy = auto.Strategy() + strategy.auto_mode = "semi" - # fetch - if fetch: - fetches = {'out': mlp.out} - else: - fetches = None + engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy) # train train_dataset = MyDataset(batch_num * batch_size) - engine.fit(train_dataset, + eval_dataset1 = MyDataset(5 * batch_size) + engine.fit(train_data=train_dataset, + epochs=2, batch_size=batch_size, - steps_per_epoch=batch_num * batch_size, - fetches=fetches) + valid_data=eval_dataset1) # eval - eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size, fetches=fetches) + eval_dataset2 = MyDataset(batch_size) + engine.evaluate(eval_dataset2, batch_size=batch_size) # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size, fetches=fetches) + engine.predict(test_dataset, batch_size=batch_size) # save temp_dir = tempfile.TemporaryDirectory() - model_filename = os.path.join(temp_dir.name, 'mlp_inf') - engine.save(model_filename, training=False, mode='predict') + model_filename = os.path.join(temp_dir.name, 'mlp') + engine.save(model_filename, training=True) + engine.load(model_filename) temp_dir.cleanup() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api_dp.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api_dp.py index 76a4772290db9..8e863e1f532bf 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api_dp.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api_dp.py @@ -26,11 +26,9 @@ import paddle.nn.functional as F import paddle.utils as utils from paddle.fluid import layers -from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet +from paddle.io import Dataset, DataLoader + import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine paddle.enable_static() batch_size = 2 @@ -91,6 +89,7 @@ def forward(self, input): out = self.linear1(out) out = self.dropout(out) out = self.linear2(out) + auto.fetch(out, "out") self.out = out return out @@ -107,46 +106,32 @@ def train(fetch): epsilon=1e-08, grad_clip=None) - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels_spec = InputSpec([batch_size], 'int64', 'label') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.amp = False - dist_strategy.pipeline = False - dist_strategy.recompute = False - # init parallel optimizer - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" # init engine - engine = Engine(mlp, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy) - engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) - - # fetch - if fetch: - fetches = {'out': mlp.out} - else: - fetches = None + engine = auto.Engine(mlp, + loss, + optimizer, + paddle.metric.Accuracy(), + strategy=dist_strategy) # train train_dataset = MyDataset(batch_num * batch_size) - engine.fit(train_dataset, batch_size=batch_size, fetches=fetches) + engine.fit(train_dataset, batch_size=batch_size) # eval eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size, fetches=fetches) + engine.evaluate(eval_dataset, batch_size=batch_size) # predict test_dataset = MyDataset(batch_size) - 
engine.predict(test_dataset, batch_size, fetches=fetches) + engine.predict(test_dataset, batch_size=batch_size) # save temp_dir = tempfile.TemporaryDirectory() model_filename = os.path.join(temp_dir.name, 'mlp_inf') - engine.save(model_filename, training=False, mode='predict') + engine.save(model_filename, training=False) temp_dir.cleanup() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py index 2884a03a023e5..9e32bb1cee571 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py @@ -14,8 +14,10 @@ import sys import numpy as np +import random import paddle +import paddle.distributed.auto_parallel as auto sys.path.append("..") import auto_parallel_gpt_model as modeling @@ -25,7 +27,7 @@ vocab_size = 1000 -class FakeDataset: +class FakeDataset(paddle.io.Dataset): def __init__(self, num_samples): self.num_samples = num_samples @@ -33,6 +35,9 @@ def __init__(self, num_samples): self.vocab_size = vocab_size def __getitem__(self, idx): + paddle.seed(2021) + np.random.seed(2021) + random.seed(2021) tokens = np.random.randint(self.vocab_size, size=self.sequence_len) position_ids = np.arange(self.sequence_len) attention_mask = np.tril(np.ones(self.sequence_len)).reshape( @@ -67,8 +72,9 @@ def create_data_holder(batch_size): def generate_model(strategy): modeling.init_global() - modeling._global_process_mesh = list( - range(paddle.distributed.get_world_size())) + ranks = list(range(paddle.distributed.get_world_size())) + modeling._global_process_mesh = auto.ProcessMesh(mesh=ranks, + dim_names=["x"]) if strategy == "serial": modeling._global_parallel_strategy = "serial" elif strategy == "mp": diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py new file mode 100644 index 0000000000000..75aa7d9c1e05f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
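For reference, the `auto.Engine` flow that these tests migrate to can be exercised end to end as follows. This is a minimal sketch assuming the positional `(model, loss, optimizer, metric)` signature shown in engine_api_dp.py above; `TinyMLP` and `RandomDataset` are invented stand-ins, not part of the PR:

import numpy as np
import paddle
import paddle.nn as nn
import paddle.distributed.auto_parallel as auto

paddle.enable_static()

class TinyMLP(nn.Layer):

    def __init__(self):
        super(TinyMLP, self).__init__()
        self.linear = nn.Linear(8, 2)

    def forward(self, x):
        return self.linear(x)

class RandomDataset(paddle.io.Dataset):

    def __init__(self, num_samples):
        self.num_samples = num_samples

    def __getitem__(self, idx):
        input = np.random.uniform(size=8).astype("float32")
        label = np.random.randint(0, 2, dtype="int64")
        return input, label

    def __len__(self):
        return self.num_samples

strategy = auto.Strategy()
strategy.auto_mode = "semi"

engine = auto.Engine(TinyMLP(),
                     nn.CrossEntropyLoss(),
                     paddle.optimizer.AdamW(learning_rate=0.00001),
                     paddle.metric.Accuracy(),
                     strategy=strategy)

engine.fit(RandomDataset(16), epochs=1, batch_size=2)  # train
engine.evaluate(RandomDataset(4), batch_size=2)        # eval
engine.predict(RandomDataset(4), batch_size=2)         # predict
engine.save("tiny_mlp", training=True)                 # checkpoint
engine.load("tiny_mlp")                                # restore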
+ +import unittest +import sys +import random +import numpy as np +import paddle + +import paddle.distributed.auto_parallel as auto +from paddle.fluid.dygraph.parallel import ParallelEnv +from get_gpt_model import generate_model, create_data_holder, FakeDataset + +paddle.enable_static() + + +def apply_pass(use_gradient_merge=False): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_gradient_merge: + gradient_merge = strategy.gradient_merge + gradient_merge.enable = True + gradient_merge.k_steps = 4 + gradient_merge.avg = True + + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestGradientMergePass(unittest.TestCase): + + def setUp(self): + self.rtol = 1e-5 + self.atol = 1e-8 + self.batch_size = 8 + self.batch_num = 10 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + + def init(self, engine): + paddle.seed(2021) + np.random.seed(2021) + random.seed(2021) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_gradient_merge=False): + reset_prog() + + strategy = apply_pass(use_gradient_merge) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model("dp") + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_results(self, ref_losses, check_losses): + np.testing.assert_allclose( + ref_losses, + check_losses, + rtol=self.rtol, + atol=self.atol, + err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format( + __class__, ref_losses, check_losses, ref_losses - check_losses)) + + def test_gradient_merge_pass(self): + # dp2 training + dp_engine = self.get_engine() + dp_losses = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + dp_losses = np.array(dp_losses["loss"]) + + # dp2 gradient merge training + gm_engine = self.get_engine(True) + gm_losses = gm_engine.fit(self.dataset, 3, batch_size=self.batch_size) + gm_losses = np.array(gm_losses["loss"]) + + avg_loss = 0 + pass_avg_ret_list = [] + for i, pass_ret in enumerate(gm_losses): + if (i + 1) % 4 == 0: + avg_loss += pass_ret + pass_avg_ret_list.append(avg_loss / 4) + avg_loss = 0 + else: + avg_loss += pass_ret + + self.check_results(dp_losses, np.array(pass_avg_ret_list)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py index 9ab49b30d9d67..85a6189985136 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py @@ -17,11 +17,7 @@ import unittest import numpy as np import paddle.distributed.auto_parallel as auto - -from paddle.static import InputSpec -from paddle.distributed import fleet from paddle.incubate.autograd import Hessian -from paddle.distributed.auto_parallel.engine import Engine np.random.seed(1234) paddle.seed(1234) @@ -87,7 +83,7 @@ def forward(self, inputs, bc_index): return eq_loss, bc_u -class LaplaceDataset: +class LaplaceDataset(paddle.io.Dataset): def __init__(self, num_sample): self.num_sample = num_sample @@ -129,23 +125,14 @@ def main(): # model laplace = LaplaceModel() - # spec - inputs_spec = [ - 
InputSpec([100, 2], 'float32', 'x'), - InputSpec([36], 'int64', 'bc_idx') - ] - labels_spec = InputSpec([36, 1], 'float32', 'bc_v') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) - - engine = Engine(laplace, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy) - engine.prepare(optimizer=optimizer, loss=loss_func) - engine.fit(train_dataset, batch_size=None) + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" + + engine = auto.Engine(laplace, + loss=loss_func, + optimizer=optimizer, + strategy=dist_strategy) + engine.fit(train_dataset, train_sample_split=2, batch_size=None) dist_context = engine.dist_context block = engine.main_program.global_block() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/iterable_dataset.py b/python/paddle/fluid/tests/unittests/auto_parallel/iterable_dataset.py index 4ca3d14f7165a..7bb183c54c938 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/iterable_dataset.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/iterable_dataset.py @@ -28,9 +28,8 @@ from paddle.fluid import layers from paddle.io import Dataset, IterableDataset, DataLoader from paddle.static import InputSpec -from paddle.distributed import fleet + import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine from paddle.optimizer.lr import CosineAnnealingDecay from paddle.fluid.dataloader.collate import default_collate_fn @@ -48,10 +47,9 @@ paddle.seed(44) -class MyDataset(IterableDataset): +class MyDataset(paddle.io.IterableDataset): def __init__(self, num_samples): - super(MyDataset, self).__init__() self.num_samples = num_samples def __iter__(self): @@ -61,10 +59,9 @@ def __iter__(self): yield input, label -class MyDataset1(Dataset): +class MyDataset1(paddle.io.Dataset): def __init__(self, num_samples): - super(MyDataset1, self).__init__() self.num_samples = num_samples self.data = [] for i in range(self.num_samples): @@ -112,12 +109,10 @@ def __init__(self, self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") def forward(self, input): - out = auto.shard_op(self.norm, dist_attr={"process_mesh": - PP_MESH_0})(input) + out = auto.shard_op(self.norm, PP_MESH_0)(input) out = self.linear0(out) out = F.gelu(out, approximate=True) - out = auto.shard_op(self.linear1, dist_attr={"process_mesh": - PP_MESH_1})(out) + out = auto.shard_op(self.linear1, PP_MESH_1)(out) out = self.dropout(out) out = self.linear2(out) self.out = out @@ -136,54 +131,36 @@ def train(fetch): epsilon=1e-08, grad_clip=None) - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels_spec = InputSpec([batch_size], 'int64', 'label') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" dist_strategy.split_data = True - fleet.init(is_collective=True, strategy=dist_strategy) # init engine - engine = Engine(mlp, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy) - engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) - - # fetch - if fetch: - fetches = {'out': mlp.out} - else: - fetches = None + engine = auto.Engine(mlp, + loss, + optimizer, + paddle.metric.Accuracy(), + strategy=dist_strategy) # train train_dataset = MyDataset(batch_num * batch_size) - train_dataset1 = MyDataset1(batch_num) - engine.fit(train_dataset, - epochs=2, - batch_size=batch_size, - 
steps_per_epoch=batch_num, - fetches=fetches) - - engine.fit(train_dataset1, - epochs=2, - batch_size=None, - steps_per_epoch=batch_num, - fetches=fetches) + engine.fit(train_dataset, epochs=2, batch_size=batch_size) + + train_dataset1 = MyDataset1(batch_size * batch_num) + engine.fit(train_dataset1, epochs=2, batch_size=None) # eval eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size, fetches=fetches) + engine.evaluate(eval_dataset, batch_size=batch_size) # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size, fetches=fetches) + engine.predict(test_dataset, batch_size=batch_size) # save temp_dir = tempfile.TemporaryDirectory() model_filename = os.path.join(temp_dir.name, 'mlp_inf') - engine.save(model_filename, training=False, mode='predict') + engine.save(model_filename, training=False) temp_dir.cleanup() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/optimization_tuner_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/optimization_tuner_api.py index 8e058d16b87b3..a245329a93a95 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/optimization_tuner_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/optimization_tuner_api.py @@ -27,10 +27,8 @@ import paddle.utils as utils from paddle.fluid import layers from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet + import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine from engine_api_dp import MyDataset paddle.enable_static() @@ -43,20 +41,6 @@ paddle.seed(44) -# class MyDataset(Dataset): - -# def __init__(self, num_samples): -# super(MyDataset, self).__init__() -# self.num_samples = num_samples - -# def __getitem__(self, index): -# input = np.random.uniform(size=image_size).astype("float32") -# label = np.random.randint(0, class_num - 1, dtype="int64") -# return input, label - -# def __len__(self): -# return self.num_samples - class MLPLayer(nn.Layer): @@ -107,50 +91,33 @@ def train(fetch): epsilon=1e-08, grad_clip=None) - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels_spec = InputSpec([batch_size], 'int64', 'label') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.amp = False - dist_strategy.pipeline = False - dist_strategy.recompute = False - # init parallel optimizer - dist_strategy.semi_auto = True - dist_strategy.sharding = True - dist_strategy.sharding_configs = { - "sharding_degree": 2, - "stage": 3, - "enable_tuning": True, - } - fleet.init(is_collective=True, strategy=dist_strategy) - - # init engine - import tempfile - tmp_dir = tempfile.TemporaryDirectory() - dataset = MyDataset(batch_num * batch_size) - + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" + # sharding config + sharding = dist_strategy.sharding + sharding.enable = True + sharding.sharding_degree = 2 + sharding.stage = 3 + sharding.enable_tuning = True + sharding.tuning_range = [0, 1, 2, 3] # Tuning configuration - tuning_config = { - "batch_size": batch_size, - "dataset": dataset, - "profile_start_step": 1, - "profile_end_step": 5, - "run_after_tuning": True, - "sharding": { - "stage_range": [0, 1, 2, 3] - }, - "verbose": True, - } - engine = Engine(mlp, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy, - user_tuning_config=tuning_config) - engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) + tuning = dist_strategy.tuning + 
tuning.enable = True + tuning.profile_start_step = 1 + tuning.profile_end_step = 5 + tuning.run_after_tuning = True + tuning.verbose = True + + dataset = MyDataset(batch_num * batch_size) + engine = auto.Engine(mlp, + loss, + optimizer, + paddle.metric.Accuracy(), + strategy=dist_strategy) + engine._tune(dataset, batch_size=batch_size) # check tuned - assert (engine._dist_contexts['train'].strategy.sharding_configs['stage'] != - 3) + assert (engine._dist_contexts['train'].strategy.sharding.stage != 3) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py new file mode 100644 index 0000000000000..271752deca077 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import sys +import random +import numpy as np +import paddle + +import paddle.distributed.auto_parallel as auto +from paddle.fluid.dygraph.parallel import ParallelEnv +from get_gpt_model import generate_model, create_data_holder, FakeDataset + + +def apply_pass(use_recompute=False): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_recompute: + recompute = strategy.recompute + recompute.enable = True + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestRecomputePass(unittest.TestCase): + + def setUp(self): + self.rtol = 1e-6 + self.atol = 1e-8 + self.batch_size = 1 + self.batch_num = 10 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + + def init(self, engine): + paddle.seed(2022) + np.random.seed(2022) + random.seed(2022) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_recompute=False): + reset_prog() + + strategy = apply_pass(use_recompute) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model("mp") + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_results(self, ref_losses, check_losses): + np.testing.assert_allclose( + ref_losses, + check_losses, + rtol=self.rtol, + atol=self.atol, + err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format( + __class__, ref_losses, check_losses, ref_losses - check_losses)) + + def test_recompute_pass(self): + # mp2 training + mp_engine = self.get_engine() + mp_losses = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + mp_losses = np.array(mp_losses["loss"]) + + # mp2 recompute training + rc_engine = self.get_engine(True) + rc_losses = rc_engine.fit(self.dataset, 3, 
batch_size=self.batch_size)
+        rc_losses = np.array(rc_losses["loss"])
+        self.check_results(mp_losses, rc_losses)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py
new file mode 100644
index 0000000000000..70dfd5f87df99
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import sys
+import random
+import numpy as np
+import paddle
+
+import paddle.distributed.auto_parallel as auto
+from paddle.fluid.dygraph.parallel import ParallelEnv
+from get_gpt_model import generate_model, create_data_holder, FakeDataset
+
+paddle.enable_static()
+
+
+def apply_pass(use_sharding=False, stage=None):
+    strategy = auto.Strategy()
+    strategy.auto_mode = "semi"
+    strategy.reinit = True
+    if use_sharding:
+        sharding = strategy.sharding
+        sharding.enable = True
+        sharding.sharding_degree = 2
+        sharding.stage = stage
+
+    return strategy
+
+
+def reset_prog():
+    paddle.fluid.framework.switch_main_program(paddle.static.Program())
+    paddle.fluid.framework.switch_startup_program(paddle.static.Program())
+
+
+class TestShardingPass(unittest.TestCase):
+
+    def setUp(self):
+        self.rtol = 1e-6
+        self.atol = 1e-8
+        self.batch_size = 2
+        self.batch_num = 10
+        self.clip_norm = 0.2
+        self.dataset = FakeDataset(self.batch_size * self.batch_num)
+
+    def init(self, engine):
+        paddle.seed(2022)
+        np.random.seed(2022)
+        random.seed(2022)
+        place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id)
+        engine._executor = paddle.static.Executor(place)
+
+    def get_engine(self, use_sharding=False, stage=None):
+        reset_prog()
+
+        strategy = apply_pass(use_sharding, stage)
+        clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm)
+        opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip)
+        model, loss = generate_model("dp")
+
+        engine = auto.Engine(model, loss, opt, strategy=strategy)
+        self.init(engine)
+        return engine
+
+    def check_results(self, ref_losses, check_losses):
+        np.testing.assert_allclose(
+            ref_losses,
+            check_losses,
+            rtol=self.rtol,
+            atol=self.atol,
+            err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format(
+                __class__, ref_losses, check_losses, ref_losses - check_losses))
+
+    def test_sharding_pass(self):
+        # dp2 training
+        dp_engine = self.get_engine()
+        dp_losses = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size)
+        dp_losses = np.array(dp_losses["loss"])
+
+        # sharding2 stage1 training
+        sharding1_engine = self.get_engine(True, 1)
+        sharding1_losses = sharding1_engine.fit(self.dataset,
+                                                3,
+                                                batch_size=self.batch_size)
+        sharding1_losses = np.array(sharding1_losses["loss"])
+        self.check_results(dp_losses, sharding1_losses)
+
+        # sharding2 stage2 training
+        sharding2_engine = self.get_engine(True, 2)
+        sharding2_losses =
sharding2_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + sharding2_losses = np.array(sharding2_losses["loss"]) + self.check_results(dp_losses, sharding2_losses) + + # sharding2 stage3 training + sharding3_engine = self.get_engine(True, 3) + sharding3_losses = sharding3_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + sharding3_losses = np.array(sharding3_losses["loss"]) + self.check_results(dp_losses, sharding3_losses) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py index 0fbe4f5bd3d09..d797df3b8ad15 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py @@ -45,9 +45,10 @@ paddle.enable_static() _global_parallel_strategy = "dp_mp_pp" -_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) -PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]]) -PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]]) +_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]], + dim_names=["x", "y", "z"]) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], dim_names=["x", "y"]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], dim_names=["x", "y"]) class MLPLayer(nn.Layer): @@ -74,16 +75,8 @@ def __init__(self, self.norm = nn.LayerNorm(d_model, epsilon=1e-5) def forward(self, input): - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, "y"]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["y", None]) out = self.norm(input) out = self.linear0(out) @@ -111,16 +104,8 @@ def mlp_forward(train_program, start_program): embedding = paddle.nn.Embedding(10, hidden_size, sparse=True) embedding_out = embedding(fill_constant_out) - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, PP_MESH_0, ["x", None]) + auto.shard_tensor(label, PP_MESH_1, ["x", None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py index 62d87fcc191ad..5a8e59b2969b0 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py @@ -34,7 +34,10 @@ batch_size = 4 hidden_size = 1024 sequence_len = 512 -_g_process_mesh = [[0, 1], [2, 3]] +_g_process_mesh = [ + auto.ProcessMesh([0, 1], dim_names=["x"]), + auto.ProcessMesh([2, 3], dim_names=["x"]) +] def get_random_inputs_and_labels(input_shape, label_shape): @@ -82,18 +85,10 @@ def __init__(self, def forward(self, input): out = self.norm(input) - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.linear0.weight, _g_process_mesh[0], [None, "x"]) out = self.linear0(out) out = F.gelu(out, approximate=True) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _g_process_mesh[1], - "dims_mapping": [0, -1] - }) + 
auto.shard_tensor(self.linear1.weight, _g_process_mesh[1], ["x", None]) out = self.linear1(out) return out @@ -123,16 +118,8 @@ def get_program(): dataloader.set_batch_generator(batch_generator_creator(), places=paddle.static.cuda_places()) # data dist_attr - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [0, -1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor(input, _g_process_mesh[0], ["x", None, None]) + auto.shard_tensor(label, _g_process_mesh[0], ["x", None, None]) mlp_start = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_embedding.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_embedding.py index 0b81b5bd48ca5..0cf5fca08acd8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_embedding.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_embedding.py @@ -42,19 +42,13 @@ def make_program_lookup_table_v1_mp_dp(): is_sparse=False) loss = paddle.fluid.layers.reduce_mean(emb_out) - auto.shard_tensor(src_ids, - dist_attr={ - "process_mesh": auto.ProcessMesh([[0, 1], [2, - 3]]), - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor( + src_ids, auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + ["x", None, None]) emb_weight = block.vars["emb_weight"] - auto.shard_tensor(emb_weight, - dist_attr={ - "process_mesh": auto.ProcessMesh([[0, 1], [2, - 3]]), - "dims_mapping": [1, -1] - }) + auto.shard_tensor( + emb_weight, auto.ProcessMesh([[0, 1], [2, 3]], + dim_names=["x", "y"]), ["y", None]) return main_program, start_program, loss diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py index 8cf2b47660fe5..77c6888d26e10 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py @@ -22,82 +22,58 @@ paddle.enable_static() -mesh = [[0, 1], [2, 3]] +mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) def init_x_row(trans_x): if trans_x: x = paddle.static.data(name='x', shape=[10, 6, 8], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [0, 1, -1] - }) + auto.shard_tensor(x, mesh, ["x", "y", None]) + return x else: x = paddle.static.data(name='x', shape=[10, 8, 6], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [0, -1, 1] - }) + auto.shard_tensor(x, mesh, ["x", None, "y"]) + return x def init_x_col(trans_x): if trans_x: x = paddle.static.data(name='x', shape=[6, 8], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(x, mesh, [None, "x"]) + return x else: x = paddle.static.data(name='x', shape=[8, 6], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(x, mesh, ["x", None]) + return x def init_y_row(trans_y): if trans_y: y = paddle.static.data(name='y', shape=[4, 6], dtype='float32') - auto.shard_tensor(y, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(y, mesh, [None, "y"]) + return y else: y = paddle.static.data(name='y', shape=[6, 4], dtype='float32') - auto.shard_tensor(y, - dist_attr={ - "process_mesh": 
mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(y, mesh, ["y", None]) + return y def init_y_col(trans_y): if trans_y: y = paddle.static.data(name='y', shape=[4, 6], dtype='float32') - auto.shard_tensor(y, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(y, mesh, ["y", None]) + return y else: y = paddle.static.data(name='y', shape=[6, 4], dtype='float32') - auto.shard_tensor(y, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(y, mesh, [None, "y"]) + return y diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py index 734bd7acf9dec..cf220a2049a31 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py @@ -71,11 +71,8 @@ def make_program(): shape=[4, 1], dtype='float32') label.stop_gradient = True - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=x, shape=[2, 8], value=1, dtype='float32') weight_attr = paddle.ParamAttr() @@ -121,17 +118,12 @@ def make_program(): shape=[8, 1], dtype='float32') label.stop_gradient = True - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x"]) auto.shard_tensor(label, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) # embedding tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=x, shape=[4], value=1, dtype='int32') @@ -141,12 +133,9 @@ def make_program(): for op in main_program.global_block().ops: if op.type == "lookup_table_v2": W = main_program.global_block().vars[op.input("W")[0]] - auto.shard_tensor(W, - dist_attr={ - "process_mesh": - auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.shard_tensor( + W, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) out = paddle.fluid.layers.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -154,26 +143,20 @@ def make_program(): param1 = paddle.fluid.layers.create_parameter( [4, 8], paddle.float32) # [2, 8] [0, -1] auto.shard_tensor(param1, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) param2 = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 4] [-1, 0] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, 0] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, "x"]) out1 = paddle.fluid.layers.matmul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 8] [-1, -1] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, None]) tmp_out = paddle.fluid.layers.matmul(out1, tmp_param) out2 = paddle.fluid.layers.matmul(tmp_out, param2) # [8, 4] [-1, 0] @@ -227,17 +210,12 @@ def make_program(): shape=[8, 1], dtype='float32') label.stop_gradient = True - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - 
"dims_mapping": [0] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x"]) auto.shard_tensor(label, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) # embedding tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=x, shape=[4], value=1, dtype='int32') @@ -247,12 +225,9 @@ def make_program(): for op in main_program.global_block().ops: if op.type == "lookup_table_v2": W = main_program.global_block().vars[op.input("W")[0]] - auto.shard_tensor(W, - dist_attr={ - "process_mesh": - auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.shard_tensor( + W, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) out = paddle.fluid.layers.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -260,25 +235,20 @@ def make_program(): param1 = paddle.fluid.layers.create_parameter( [4, 8], paddle.float32) # [2, 8] [0, -1] auto.shard_tensor(param1, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) param2 = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 4] [-1, 0] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, 0] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, "x"]) out1 = paddle.matmul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 8] [-1, -1] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, None]) + tmp_out = paddle.matmul(out1, tmp_param) out2 = paddle.matmul(tmp_out, param2) # [8, 4] [-1, 0] @@ -331,17 +301,11 @@ def make_program(): shape=[8, 1], dtype='float32') label.stop_gradient = True - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0] - }) - + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x"]) auto.shard_tensor(label, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) # embedding tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=x, shape=[4], value=1, dtype='int32') @@ -351,12 +315,9 @@ def make_program(): for op in main_program.global_block().ops: if op.type == "lookup_table_v2": W = main_program.global_block().vars[op.input("W")[0]] - auto.shard_tensor(W, - dist_attr={ - "process_mesh": - auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.shard_tensor( + W, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) out = paddle.fluid.layers.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -364,25 +325,21 @@ def make_program(): param1 = paddle.fluid.layers.create_parameter( [4, 8], paddle.float32) # [2, 8] [0, -1] auto.shard_tensor(param1, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) param2 = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 4] [-1, 0] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, 0] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, "x"]) + out1 = paddle.fluid.layers.mul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 8] [-1, -1] 
auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, None]) + tmp_out = paddle.fluid.layers.mul(out1, tmp_param) out2 = paddle.fluid.layers.mul(tmp_out, param2) # [8, 4] [-1, 0] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py index dfddba3dda1c9..14783dd891152 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py @@ -29,11 +29,8 @@ def make_program_dp2(): with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') x.stop_gradient = False - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) tmp_0 = paddle.norm(x, p=2) return main_program, start_program, tmp_0 @@ -44,11 +41,8 @@ def make_program_serial(): with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') x.stop_gradient = False - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0]), - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0], dim_names=["x"]), + [None, None, None]) tmp_0 = paddle.norm(x, p=2) return main_program, start_program, tmp_0 diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py index 60b43ef9fe3bc..e563e7554e905 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py @@ -29,11 +29,9 @@ def make_program_dp2(): with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32') x.stop_gradient = False - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) + tmp_0 = paddle.reshape(x, shape=[0, 0, 4, 2]) tmp_1 = paddle.reshape(tmp_0, shape=[0, 0, 8]) tmp_2 = tmp_1.reshape((tmp_1.shape[0], tmp_1.shape[1], -1)) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py index e12fd0f922a5e..a1098899e3c53 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -25,11 +25,9 @@ def make_program_dp2(): start_program = paddle.fluid.Program() with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) + tmp_0 = x[0] tmp_1 = x[:, 0, :] tmp_2 = x[:, :, 1] @@ -42,11 +40,9 @@ def make_program_serial(): start_program = paddle.fluid.Program() with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - 
"process_mesh": auto.ProcessMesh([0]), - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0], dim_names=["x"]), + [None, None, None]) + tmp_0 = x[0] tmp_1 = x[:, 0, :] tmp_2 = x[:, :, 1] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_interface.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_interface.py new file mode 100644 index 0000000000000..6f0b73d83a744 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_interface.py @@ -0,0 +1,224 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.static as static +import paddle.distributed as dist +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +process_mesh1 = ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], + dim_names=["x", "y"]) +process_mesh2 = ProcessMesh(mesh=[0, 1, 2, 3], dim_names=["x"]) + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal(mean=0.0, + std=initializer_range) + + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + auto.shard_tensor(self.linear0.weight, process_mesh1[0], [None, "y"]) + linear0 = auto.shard_op(self.linear0, process_mesh1, + [["y", None, None]], [[None, "x", None]]) + linear0_out = linear0(input) + + gelu = auto.shard_op(F.gelu, process_mesh1, [["y", "x", None], None]) + gelu_out = gelu(linear0_out, approximate=True) + + auto.shard_tensor(self.linear1.weight, shard_spec=["y", None]) + linear1 = auto.shard_op(self.linear1, + process_mesh1[1], + out_shard_specs=[["y", None, None]]) + linear1_out = linear1(gelu_out) + + return self.linear0, self.linear1, linear0_out, gelu_out, linear1_out + + +class TestAutoParallelAPI(unittest.TestCase): + + def test_api(self): + # input + input = static.data(name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data(name="label", + shape=[batch_size, sequence_len, 1], + dtype='float32') + + auto.shard_tensor(input, process_mesh1, ["x", None, None]) + auto.shard_tensor(label, process_mesh1, ["y", 
None, None]) + + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + with ProcessMesh(process_mesh1.mesh, process_mesh1.dim_names): + linear0, linear1, linear0_out, gelu_out, linear1_out = mlp(input) + + default_program = paddle.fluid.default_main_program() + default_dist_context = get_default_distributed_context() + + self.assertEqual(len(default_program.blocks[0].ops), 5) + matmul0 = default_program.blocks[0].ops[0] + self.assertEqual(matmul0.type, "matmul_v2") + ewise_add0 = default_program.blocks[0].ops[1] + self.assertEqual(ewise_add0.type, "elementwise_add") + gelu = default_program.blocks[0].ops[2] + self.assertEqual(gelu.type, "gelu") + matmul1 = default_program.blocks[0].ops[3] + self.assertEqual(matmul1.type, "matmul_v2") + ewise_add1 = default_program.blocks[0].ops[4] + self.assertEqual(ewise_add1.type, "elementwise_add") + + dist_input = default_dist_context.get_dist_tensor_for_program(input) + self.assertEqual(dist_input.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_input.dist_attr.dims_mapping, [0, -1, -1]) + self.assertTrue(dist_input.dist_attr.is_annotated("process_mesh")) + self.assertTrue(dist_input.dist_attr.is_annotated("dims_mapping")) + + dist_input = default_dist_context.get_dist_tensor_for_program(label) + self.assertEqual(dist_input.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_input.dist_attr.dims_mapping, [1, -1, -1]) + self.assertTrue(dist_input.dist_attr.is_annotated("process_mesh")) + self.assertTrue(dist_input.dist_attr.is_annotated("dims_mapping")) + + dist_linear0_weight = default_dist_context.get_dist_tensor_for_program( + linear0.weight) + self.assertEqual(dist_linear0_weight.dist_attr.process_mesh, + process_mesh1[0]) + self.assertEqual(dist_linear0_weight.dist_attr.dims_mapping, [-1, 0]) + self.assertTrue( + dist_linear0_weight.dist_attr.is_annotated("process_mesh")) + self.assertTrue( + dist_linear0_weight.dist_attr.is_annotated("dims_mapping")) + + dist_linear1_weight = default_dist_context.get_dist_tensor_for_program( + linear1.weight) + self.assertEqual(dist_linear1_weight.dist_attr.process_mesh, + process_mesh1) + self.assertEqual(dist_linear1_weight.dist_attr.dims_mapping, [1, -1]) + self.assertTrue( + dist_linear1_weight.dist_attr.is_annotated("process_mesh")) + self.assertTrue( + dist_linear1_weight.dist_attr.is_annotated("dims_mapping")) + + dist_linear1_out = default_dist_context.get_dist_tensor_for_program( + linear1_out) + self.assertEqual(dist_linear1_out.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_linear1_out.dist_attr.dims_mapping, [-1, -1, -1]) + self.assertTrue(dist_linear1_out.dist_attr.is_annotated("process_mesh")) + self.assertFalse( + dist_linear1_out.dist_attr.is_annotated("dims_mapping")) + + dist_op = default_dist_context.get_dist_op_for_program(matmul0) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(input.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1) + self.assertEqual(tensor_dist_attr.dims_mapping, [1, -1, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertTrue(tensor_dist_attr.is_annotated("dims_mapping")) + + dist_op = default_dist_context.get_dist_op_for_program(ewise_add0) + 
self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr( + linear0_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1) + self.assertEqual(tensor_dist_attr.dims_mapping, [-1, 0, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertTrue(tensor_dist_attr.is_annotated("dims_mapping")) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + + dist_op = default_dist_context.get_dist_op_for_program(gelu) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr( + linear0_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1) + self.assertEqual(tensor_dist_attr.dims_mapping, [1, 0, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertTrue(tensor_dist_attr.is_annotated("dims_mapping")) + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr(gelu_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1) + self.assertEqual(tensor_dist_attr.dims_mapping, [-1, -1, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertFalse(tensor_dist_attr.is_annotated("dims_mapping")) + + dist_op = default_dist_context.get_dist_op_for_program(matmul1) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1[1]) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(gelu_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1[1]) + self.assertEqual(tensor_dist_attr.dims_mapping, [-1, -1, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertFalse(tensor_dist_attr.is_annotated("dims_mapping")) + + dist_op = default_dist_context.get_dist_op_for_program(ewise_add1) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1[1]) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr( + linear1_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1[1]) + self.assertEqual(tensor_dist_attr.dims_mapping, [0, -1, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertTrue(tensor_dist_attr.is_annotated("dims_mapping")) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py index 35301e4489598..c0ff991ca52fe 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py @@ -26,7 +26,6 @@ from paddle.io import Dataset from paddle.static import InputSpec from paddle.fluid.framework import _non_static_mode -from paddle.distributed.auto_parallel.engine import Engine from test_to_static import MLPLayer, MyDataset @@ -60,15 +59,13 @@ def init_dataset(self): 
self.dataset = MyDataset(self.batch_num * self.batch_size) def init_engine(self): - inputs = InputSpec([self.batch_size, self.hidden_size], 'float32', 'x') - labels = InputSpec([self.batch_size], 'int64', 'label') + # inputs = InputSpec([self.batch_size, self.hidden_size], 'float32', 'x') + # labels = InputSpec([self.batch_size], 'int64', 'label') - self.engine = Engine(model=self.mlp, - inputs_spec=inputs, - labels_spec=labels) - self.engine.prepare(optimizer=self.optimizer, - loss=self.loss, - metrics=paddle.metric.Accuracy()) + self.engine = auto.Engine(model=self.mlp, + loss=self.loss, + optimizer=self.optimizer, + metrics=paddle.metric.Accuracy()) class TestLRScheduler(TestEngineBase): @@ -80,9 +77,9 @@ def init_optimizer(self): def test_lr_scheduler(self): self.init_engine() - lr = self.engine._optimizer._learning_rate - assert isinstance(lr, paddle.optimizer.lr.LRScheduler) self.engine.fit(self.dataset, batch_size=self.batch_size) + lr = self.engine._lr_optimizer._learning_rate + assert isinstance(lr, paddle.optimizer.lr.LRScheduler) class TestGradClipByGlobalNorm(TestEngineBase): @@ -94,7 +91,6 @@ def init_optimizer(self): def test_grad_clip(self): - clip = self.engine._optimizer._grad_clip self.engine.fit(self.dataset, batch_size=self.batch_size) self.check_program() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py new file mode 100644 index 0000000000000..ed2cf0328e85c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
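The test_lr_grad_clip.py hunk above follows the Engine rework applied throughout this patch: inputs_spec/labels_spec and the separate prepare() call are gone, loss, optimizer, and metrics move into the constructor, and the wrapped optimizer is only materialized while fit() builds the program, which is why the LRScheduler assertion now runs after fit() and reads engine._lr_optimizer. A hedged sketch of the new sequence; the model, loss, and scheduler below are stand-ins, not the test's MLPLayer fixtures:

import paddle
import paddle.distributed.auto_parallel as auto

model = paddle.nn.Linear(64, 10)        # stand-in model
loss = paddle.nn.CrossEntropyLoss()
scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                               values=[0.1, 0.01])
optimizer = paddle.optimizer.Adam(learning_rate=scheduler)

engine = auto.Engine(model=model,
                     loss=loss,
                     optimizer=optimizer,
                     metrics=paddle.metric.Accuracy())
# engine.fit(dataset, batch_size=4)     # dataset: any paddle.io.Dataset
# Only after fit() can the scheduler be read back from the engine:
#     assert isinstance(engine._lr_optimizer._learning_rate,
#                       paddle.optimizer.lr.LRScheduler)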
+ +import tempfile +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestAMPPass(unittest.TestCase): + + def test_mp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "amp_pass_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir", + tmp_dir.name, launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_grad_clip.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_grad_clip.py similarity index 100% rename from python/paddle/fluid/tests/unittests/auto_parallel/test_grad_clip.py rename to python/paddle/fluid/tests/unittests/auto_parallel/test_pass_grad_clip.py diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_gradient_merge.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_gradient_merge.py new file mode 100644 index 0000000000000..e55ddbea58336 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_gradient_merge.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestGradientMergePass(unittest.TestCase): + + def test_dp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, + "gradient_merge_pass_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir", + tmp_dir.name, launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_quantization.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_quantization.py new file mode 100644 index 0000000000000..ff96f43a928a9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_quantization.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
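test_pass_amp.py and test_pass_gradient_merge.py above share one harness, and the recompute and sharding tests below repeat it again: run the companion *_unittest.py script on two devices via paddle.distributed.launch and assert a clean exit. Factored into a sketch (the base class is hypothetical; the patch deliberately inlines this body per file, and the WITH_COVERAGE branch is omitted here):

import os
import subprocess
import sys
import tempfile
import unittest


class PassLaunchTestBase(unittest.TestCase):
    # Hypothetical helper; each test file in the patch repeats this logic.

    def run_pass_unittest(self, script_name):
        file_dir = os.path.dirname(os.path.abspath(__file__))
        launch_model_path = os.path.join(file_dir, script_name)

        tmp_dir = tempfile.TemporaryDirectory()
        cmd = [
            sys.executable, "-u", "-m", "paddle.distributed.launch",
            "--devices", "0,1", "--log_dir", tmp_dir.name, launch_model_path
        ]

        # The launched script exercises the pass under test; a non-zero
        # return code fails the wrapping unit test.
        process = subprocess.Popen(cmd)
        process.wait()
        self.assertEqual(process.returncode, 0)

        tmp_dir.cleanup()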
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import sys +import random +import numpy as np +import paddle + +import paddle.distributed.auto_parallel as auto +from get_gpt_model import generate_model, create_data_holder, FakeDataset + +paddle.enable_static() + + +def apply_pass(): + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" + qat = dist_strategy.qat + qat.enable = True + qat.channel_wise_abs_max = True + qat.weight_bits = 8 + qat.activation_bits = 8 + qat.not_quant_pattern = ['skip_quant'] + return dist_strategy + + +class TestQuantizationPass(unittest.TestCase): + + def test_qat_pass(self): + + batch_size = 8 + batch_num = 10 + + strategy = apply_pass() + model, loss = generate_model("serial") + opt = paddle.optimizer.AdamW(learning_rate=0.00001) + engine = auto.Engine(model, loss, opt, strategy=strategy) + dataset = FakeDataset(batch_size * batch_num) + engine.fit(dataset, 3, batch_size=batch_size) + + self.check_program(engine.main_program) + + def check_program(self, program): + + quantizable_op_and_inputs = {'matmul_v2': ['X', 'Y']} + quantizable_grad_op_inputs = {'matmul_v2_grad': ['X', 'Y']} + + quantized_ops = set() + for block in program.blocks: + for op in block.ops: + is_quantized = False + if op.type in quantizable_op_and_inputs: + for arg_name in op.input_arg_names: + if ".quantized" in arg_name: + is_quantized = True + + if not is_quantized: + continue + + # check forward + if op.type in quantizable_op_and_inputs: + for arg_name in op.input_arg_names: + assert arg_name.endswith('.quantized.dequantized') + quantized_ops.add(arg_name) + + for op in block.ops: + is_quantized = False + if op.type in quantizable_grad_op_inputs: + for pname in quantizable_grad_op_inputs[op.type]: + arg_name = op.input(pname)[0] + if ".quantized" in arg_name: + is_quantized = True + + if not is_quantized: + continue + + # check backward + if op.type in quantizable_grad_op_inputs: + for pname in quantizable_grad_op_inputs[op.type]: + arg_name = op.input(pname)[0] + assert arg_name.endswith('.quantized.dequantized') + assert arg_name in quantized_ops + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_recompute.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_recompute.py new file mode 100644 index 0000000000000..e7eb7ddd2a604 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_recompute.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
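The quantization test above is the new-API counterpart of the test_quantization.py file deleted further down: QAT options now live on nested fields of auto.Strategy() instead of fleet.DistributedStrategy().qat_configs, and check_program() keys off the .quantized.dequantized suffix the pass appends to quantized inputs. The configuration portion in isolation, values copied from the hunk above:

import paddle.distributed.auto_parallel as auto

dist_strategy = auto.Strategy()
dist_strategy.auto_mode = "semi"        # semi-automatic parallelization

qat = dist_strategy.qat
qat.enable = True
qat.channel_wise_abs_max = True         # per-channel abs-max quantization
qat.weight_bits = 8                     # 8-bit weights
qat.activation_bits = 8                 # 8-bit activations
qat.not_quant_pattern = ['skip_quant']  # variables matching this are skipped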
+# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestRecomputePass(unittest.TestCase): + + def test_mp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "recompute_pass_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir", + tmp_dir.name, launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_sharding.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_sharding.py new file mode 100644 index 0000000000000..77e969c83bf81 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_sharding.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestShardingPass(unittest.TestCase): + + def test_dp2sharding2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "sharding_pass_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir", + tmp_dir.name, launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py new file mode 100644 index 0000000000000..4232d64071e14 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
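The ProcessMesh unit test introduced below exercises the mesh object that the new annotation API leans on, so it is worth restating the correspondence the earlier hunks apply again and again: an old dims_mapping entry is the index of a mesh dimension (-1 for replicated), while the new shard_spec entry is that dimension's name (None for replicated). A minimal sketch, assuming a 2x2 mesh and a placeholder tensor not taken from this patch:

import paddle
import paddle.distributed.auto_parallel as auto

paddle.enable_static()

mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"])
x = paddle.static.data(name='x', shape=[8, 6], dtype='float32')

# Old form (removed throughout this patch):
#     auto.shard_tensor(x, dist_attr={"process_mesh": mesh,
#                                     "dims_mapping": [0, -1]})
# New form: shard axis 0 along mesh dimension "x", replicate axis 1.
auto.shard_tensor(x, mesh, ["x", None])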
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.static as static +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal(mean=0.0, + std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + return out + + +class TestProcessMesh(unittest.TestCase): + + def test_construction(self): + mesh = [[0, 1, 2], [3, 4, 5]] + process_mesh = ProcessMesh(mesh, dim_names=["x", "y"]) + self.assertEqual(process_mesh.shape, [2, 3]) + self.assertEqual(process_mesh.process_ids, [0, 1, 2, 3, 4, 5]) + self.assertEqual(process_mesh.dim_names, ["x", "y"]) + self.assertEqual(process_mesh.ndim, 2) + self.assertEqual(process_mesh, process_mesh) + self.assertEqual(str(process_mesh), str(process_mesh)) + + sub_process_mesh1 = process_mesh[0] + self.assertEqual(sub_process_mesh1.shape, [3]) + self.assertEqual(sub_process_mesh1.process_ids, [0, 1, 2]) + self.assertEqual(sub_process_mesh1.dim_names, ["y"]) + self.assertEqual(sub_process_mesh1.ndim, 1) + + sub_process_mesh2 = process_mesh[:, 1] + self.assertEqual(sub_process_mesh2.shape, [2]) + self.assertEqual(sub_process_mesh2.process_ids, [1, 4]) + self.assertEqual(sub_process_mesh2.dim_names, ["x"]) + self.assertEqual(sub_process_mesh2.ndim, 1) + + sub_process_mesh3 = sub_process_mesh2[:] + self.assertEqual(sub_process_mesh3.shape, [2]) + self.assertEqual(sub_process_mesh3.process_ids, [1, 4]) + self.assertEqual(sub_process_mesh3.dim_names, ["x"]) + self.assertEqual(sub_process_mesh3.ndim, 1) + + sub_process_mesh4 = process_mesh[1, 1] + self.assertEqual(sub_process_mesh4.shape, [1]) + self.assertEqual(sub_process_mesh4.process_ids, [4]) + self.assertEqual(sub_process_mesh4.dim_names, ["d0"]) + self.assertEqual(sub_process_mesh4.ndim, 1) + + def test_context_manager(self): + mesh = np.array([1, 2, 3, 4]) + input = static.data(name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data(name="label", + shape=[batch_size, sequence_len, 1], + dtype='float32') + + 
mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + with ProcessMesh(mesh, "d"): + out = mlp(input) + + default_program = paddle.fluid.default_main_program() + default_dist_context = get_default_distributed_context() + + for block in default_program.blocks: + for tensor in block.vars.values(): + dist_tensor = default_dist_context.get_dist_tensor_for_program( + tensor) + if dist_tensor is not None: + self.assertEqual(dist_tensor.dist_attr.process_mesh, + ProcessMesh(mesh)) + for op in block.ops: + dist_op = default_dist_context.get_dist_op_for_program(op) + if dist_op is not None: + self.assertEqual(dist_op.dist_attr.process_mesh, + ProcessMesh(mesh)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh_v2.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh_v2.py index fcfafcb3e6d6d..3c58f9e8cd393 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh_v2.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh_v2.py @@ -13,7 +13,8 @@ # limitations under the License import unittest -from paddle.distributed.auto_parallel.process_mesh_v2 import ProcessMesh +from paddle.distributed.auto_parallel.process_mesh_v2 import ( + ProcessMesh, compute_compatible_process_mesh, merge_process_mesh) class TestProcessMesh(unittest.TestCase): @@ -39,6 +40,53 @@ def test_process_mesh(self): self.assertNotEqual(process_mesh, process_mesh2) self.assertEqual(str(process_mesh), str(process_mesh)) + def test_compute_compatible_process_mesh(self): + process_mesh1 = ProcessMesh([[0, 1, 2], [3, 4, 5]], + dim_names=["x", "y"]) + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh1, None]) + self.assertEqual(compatible_process_mesh, process_mesh1) + compatible_process_mesh = compute_compatible_process_mesh( + [None, process_mesh1]) + self.assertEqual(compatible_process_mesh, process_mesh1) + + process_mesh2 = ProcessMesh([[0, 1, 2], [3, 4, 5]]) + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh1, process_mesh2]) + self.assertEqual(compatible_process_mesh, process_mesh1) + self.assertEqual(compatible_process_mesh, process_mesh2) + + process_mesh2 = ProcessMesh([[0, 1, 2, 3, 4, 5]]) + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh1, process_mesh2]) + self.assertEqual(compatible_process_mesh, process_mesh1) + + process_mesh2 = ProcessMesh([[0, 1, 2]]) + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh1, process_mesh2]) + self.assertEqual(compatible_process_mesh, process_mesh1) + + def test_merge_process_mesh(self): + process_mesh1 = ProcessMesh([[0, 1, 2], [3, 4, 5]], + dim_names=["x", "y"]) + merged_process_mesh = merge_process_mesh([process_mesh1, None]) + self.assertEqual(merged_process_mesh, ProcessMesh([0, 1, 2, 3, 4, 5])) + merged_process_mesh = merge_process_mesh([None, process_mesh1]) + self.assertEqual(merged_process_mesh, ProcessMesh([0, 1, 2, 3, 4, 5])) + + process_mesh2 = ProcessMesh([[0, 1, 2], [3, 4, 5]]) + merged_process_mesh = merge_process_mesh([process_mesh1, process_mesh2]) + self.assertEqual(merged_process_mesh, ProcessMesh([0, 1, 2, 3, 4, 5])) + + process_mesh2 = ProcessMesh([[0, 1, 2]]) + merged_process_mesh = merge_process_mesh([process_mesh1, process_mesh2]) + self.assertEqual(merged_process_mesh, ProcessMesh([0, 1, 2, 3, 4, 5])) + + process_mesh2 =
ProcessMesh([[6, 7]]) + merged_process_mesh = merge_process_mesh([process_mesh1, process_mesh2]) + self.assertEqual(merged_process_mesh, + ProcessMesh([0, 1, 2, 3, 4, 5, 6, 7])) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_quantization.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_quantization.py deleted file mode 100644 index f84ee03e0c940..0000000000000 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_quantization.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import sys -import numpy as np -import paddle - -import paddle.distributed.fleet as fleet -import paddle.distributed.auto_parallel as auto - -from paddle.distributed.auto_parallel.engine import Engine -from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr - -sys.path.append("..") -import auto_parallel_gpt_model as modeling -from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion - -paddle.enable_static() - - -class FakeDataset: - - def __init__(self, num_samples, sequence_len, vocab_size): - self.num_samples = num_samples - self.sequence_len = sequence_len - self.vocab_size = vocab_size - - def __getitem__(self, idx): - tokens = np.random.randint(self.vocab_size, size=self.sequence_len) - position_ids = np.arange(self.sequence_len) - attention_mask = np.tril(np.ones(self.sequence_len)).reshape( - (1, self.sequence_len, self.sequence_len)).astype(np.float32) - labels = np.random.randint(self.vocab_size, size=self.sequence_len) - loss_mask = np.ones(self.sequence_len).astype(np.float32) - return tokens, position_ids, attention_mask, labels, loss_mask - - def __len__(self): - return self.num_samples - - -def apply_pass(): - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True - dist_strategy.qat = True - dist_strategy.qat_configs = { - 'channel_wise_abs_max': True, - 'weight_bits': 8, - 'activation_bits': 8, - 'not_quant_pattern': ['skip_quant'], - } - return dist_strategy - - -def create_data_holder(batch_size, sequence_len): - tokens = paddle.static.InputSpec(name="tokens", - shape=[batch_size, sequence_len], - dtype='int64') - position_ids = paddle.static.InputSpec(name="position_ids", - shape=[batch_size, sequence_len], - dtype='int64') - attention_mask = paddle.static.InputSpec( - name="attention_mask", - shape=[batch_size, 1, sequence_len, sequence_len], - dtype='float32') - labels = paddle.static.InputSpec(name="labels", - shape=[batch_size, sequence_len], - dtype='int64') - loss_mask = paddle.static.InputSpec(name="loss_mask", - shape=[batch_size, sequence_len], - dtype='float32') - return [tokens, position_ids, attention_mask], [labels, loss_mask] - - -def get_gpt_model(): - modeling.init_global() - modeling._global_parallel_strategy = "serial" - modeling._global_process_mesh = auto.ProcessMesh(mesh=[0]) - - gpt = 
GPTModel(vocab_size=1000, - hidden_size=64, - num_hidden_layers=2, - num_attention_heads=8, - intermediate_size=256, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=1024, - type_vocab_size=1, - initializer_range=0.02, - pad_token_id=0, - eos_token_id=7, - bos_token_id=0, - eol_token_id=3) - model = GPTForPretraining(gpt, - vocab_size=1000, - hidden_size=64, - initializer_range=0.02) - criterion = GPTPretrainingCriterion() - return model, criterion - - -class TestQuantizationPass(unittest.TestCase): - - def test_qat_pass(self): - - batch_size = 8 - batch_num = 10 - sequence_len = 512 - vocab_size = 1000 - - strategy = apply_pass() - model, loss = get_gpt_model() - opt = paddle.optimizer.AdamW(learning_rate=0.00001) - inputs_spec, labels_spec = create_data_holder(batch_size=batch_size, - sequence_len=sequence_len) - - engine = Engine(model, inputs_spec, labels_spec, strategy=strategy) - engine.prepare(optimizer=opt, loss=loss) - - dataset = FakeDataset(batch_size * batch_num, sequence_len, vocab_size) - engine.fit(train_data=dataset, batch_size=batch_size) - - self.check_program(engine.main_program) - - def check_program(self, program): - - quantizable_op_and_inputs = {'matmul_v2': ['X', 'Y']} - quantizable_grad_op_inputs = {'matmul_v2_grad': ['X', 'Y']} - - quantized_ops = set() - for block in program.blocks: - for op in block.ops: - is_quntized = False - if op.type in quantizable_op_and_inputs: - for arg_name in op.input_arg_names: - if ".quantized" in arg_name: - is_quntized = True - - if not is_quntized: - continue - - # check forward - if op.type in quantizable_op_and_inputs: - for arg_name in op.input_arg_names: - assert arg_name.endswith('.quantized.dequantized') - quantized_ops.add(arg_name) - - for op in block.ops: - is_quntized = False - if op.type in quantizable_grad_op_inputs: - for pname in quantizable_grad_op_inputs[op.type]: - arg_name = op.input(pname)[0] - if ".quantized" in arg_name: - is_quntized = True - - if not is_quntized: - continue - - # check backward - if op.type in quantizable_grad_op_inputs: - for pname in quantizable_grad_op_inputs[op.type]: - arg_name = op.input(pname)[0] - assert arg_name.endswith('.quantized.dequantized') - assert arg_name in quantized_ops - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py new file mode 100644 index 0000000000000..9fae8d970b2bb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py @@ -0,0 +1,206 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
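Besides switching APIs, the deletion above moves the test fixtures into the shared get_gpt_model module that test_pass_quantization.py imports (generate_model, create_data_holder, FakeDataset). For reference, a plausible restatement of the dataset fixture, with the field construction copied from the removed code and the default sizes assumed rather than confirmed:

import numpy as np


class FakeDataset:
    # Synthetic GPT-style samples, as in the deleted test_quantization.py.

    def __init__(self, num_samples, sequence_len=512, vocab_size=1000):
        self.num_samples = num_samples
        self.sequence_len = sequence_len
        self.vocab_size = vocab_size

    def __getitem__(self, idx):
        tokens = np.random.randint(self.vocab_size, size=self.sequence_len)
        position_ids = np.arange(self.sequence_len)
        attention_mask = np.tril(np.ones(self.sequence_len)).reshape(
            (1, self.sequence_len, self.sequence_len)).astype(np.float32)
        labels = np.random.randint(self.vocab_size, size=self.sequence_len)
        loss_mask = np.ones(self.sequence_len).astype(np.float32)
        return tokens, position_ids, attention_mask, labels, loss_mask

    def __len__(self):
        return self.num_samples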
+ +import os +# import yaml +import unittest +import paddle.distributed.auto_parallel as auto + + +class TestStrategy(unittest.TestCase): + + def test_default_config(self): + strategy = auto.Strategy() + + recompute = strategy.recompute + self.assertEqual(recompute.enable, False) + self.assertEqual(recompute.checkpoints, None) + + amp = strategy.amp + self.assertEqual(amp.enable, False) + self.assertAlmostEqual(amp.init_loss_scaling, 32768.0) + self.assertEqual(amp.incr_every_n_steps, 1000) + self.assertEqual(amp.decr_every_n_nan_or_inf, 2) + self.assertAlmostEqual(amp.incr_ratio, 2.0) + self.assertAlmostEqual(amp.decr_ratio, 0.8) + self.assertEqual(amp.use_dynamic_loss_scaling, True) + self.assertEqual(amp.custom_black_list, []) + self.assertEqual(amp.custom_white_list, []) + self.assertEqual(amp.custom_black_varnames, []) + self.assertEqual(amp.use_pure_fp16, False) + self.assertEqual(amp.use_fp16_guard, True) + self.assertEqual(amp.use_optimizer_fp16, False) + + sharding = strategy.sharding + self.assertEqual(sharding.enable, False) + self.assertEqual(sharding.stage, 1) + self.assertEqual(sharding.sharding_degree, 8) + self.assertAlmostEqual(sharding.segment_broadcast_MB, 32.0) + self.assertEqual(sharding.enable_tuning, False) + self.assertEqual(sharding.tuning_range, []) + + gradient_merge = strategy.gradient_merge + self.assertEqual(gradient_merge.enable, False) + self.assertEqual(gradient_merge.k_steps, 1) + self.assertEqual(gradient_merge.avg, True) + + qat = strategy.qat + self.assertEqual(qat.enable, False) + self.assertEqual(qat.channel_wise_abs_max, True) + self.assertEqual(qat.weight_bits, 8) + self.assertEqual(qat.activation_bits, 8) + self.assertEqual(qat.not_quant_pattern, ['skip_quant']) + self.assertEqual(qat.algo, None) + + tuning = strategy.tuning + self.assertEqual(tuning.enable, False) + self.assertEqual(tuning.batch_size, 1) + self.assertEqual(tuning.dataset, None) + self.assertEqual(tuning.profile_start_step, 1) + self.assertEqual(tuning.profile_end_step, 1) + self.assertEqual(tuning.run_after_tuning, True) + self.assertEqual(tuning.verbose, True) + + def test_modify_config(self): + strategy = auto.Strategy() + + recompute = strategy.recompute + recompute.enable = True + recompute.checkpoints = ["x"] + self.assertEqual(recompute.enable, True) + self.assertEqual(recompute.checkpoints, ["x"]) + + amp = strategy.amp + amp.enable = True + amp.init_loss_scaling = 16384.0 + amp.incr_every_n_steps = 2000 + amp.decr_every_n_nan_or_inf = 4 + amp.incr_ratio = 4.0 + amp.decr_ratio = 0.4 + amp.use_dynamic_loss_scaling = False + amp.custom_white_list = ["x"] + amp.custom_black_list = ["y"] + amp.custom_black_varnames = ["z"] + amp.use_pure_fp16 = True + amp.use_fp16_guard = False + amp.use_optimizer_fp16 = True + self.assertEqual(amp.enable, True) + self.assertAlmostEqual(amp.init_loss_scaling, 16384.0) + self.assertEqual(amp.incr_every_n_steps, 2000) + self.assertEqual(amp.decr_every_n_nan_or_inf, 4) + self.assertAlmostEqual(amp.incr_ratio, 4.0) + self.assertAlmostEqual(amp.decr_ratio, 0.4) + self.assertEqual(amp.use_dynamic_loss_scaling, False) + self.assertEqual(amp.custom_white_list, ["x"]) + self.assertEqual(amp.custom_black_list, ["y"]) + self.assertEqual(amp.custom_black_varnames, ["z"]) + self.assertEqual(amp.use_pure_fp16, True) + self.assertEqual(amp.use_fp16_guard, False) + self.assertEqual(amp.use_optimizer_fp16, True) + + sharding = strategy.sharding + sharding.enable = True + sharding.stage = 2 + sharding.sharding_degree = 2 + sharding.segment_broadcast_MB = 64.0 + 
sharding.enable_tuning = True + sharding.tuning_range = [1, 2, 3] + self.assertEqual(sharding.enable, True) + self.assertEqual(sharding.stage, 2) + self.assertEqual(sharding.sharding_degree, 2) + self.assertAlmostEqual(sharding.segment_broadcast_MB, 64.0) + self.assertEqual(sharding.enable_tuning, True) + self.assertEqual(sharding.tuning_range, [1, 2, 3]) + + gradient_merge = strategy.gradient_merge + gradient_merge.enable = True + gradient_merge.k_steps = 4 + gradient_merge.avg = False + self.assertEqual(gradient_merge.enable, True) + self.assertEqual(gradient_merge.k_steps, 4) + self.assertEqual(gradient_merge.avg, False) + + # def test_file_config(self): + # yaml_data = """ + # all_ranks: false + # amp: + # custom_black_list: + # - y + # custom_black_varnames: + # - z + # custom_white_list: + # - x + # decr_every_n_nan_or_inf: 4 + # decr_ratio: 0.4 + # enable: false + # incr_every_n_steps: 2000 + # incr_ratio: 4.0 + # init_loss_scaling: 16384.0 + # use_dynamic_loss_scaling: false + # use_fp16_guard: false + # use_optimizer_fp16: true + # use_pure_fp16: true + # auto_mode: semi + # gradient_merge: + # avg: false + # enable: false + # k_steps: 4 + # gradient_scale: true + # qat: + # activation_bits: 8 + # algo: null + # channel_wise_abs_max: true + # enable: false + # not_quant_pattern: + # - skip_quant + # weight_bits: 8 + # recompute: + # checkpoints: null + # enable: false + # enable_tuning: false + # return_numpy: true + # seed: null + # sharding: + # enable: false + # enable_tuning: true + # segment_broadcast_MB: 64.0 + # sharding_degree: 8 + # stage: 2 + # tuning_range: None + # split_data: false + # tuning: + # batch_size: 1 + # dataset: null + # enable: false + # profile_end_step: 1 + # profile_start_step: 1 + # run_after_tuning: true + # verbose: true + # use_cache: true + # """ + # yaml_path = "./strategy.yml" + # yaml_dict = yaml.load(yaml_data, Loader=yaml.Loader) + # with open(yaml_path, 'w') as outfile: + # yaml.dump(yaml_dict, outfile, default_flow_style=False) + + # strategy = auto.Strategy(yaml_path) + # self.assertEqual(yaml_dict, strategy.to_dict()) + + # # Remove the created file + # if os.path.exists(yaml_path): + # os.remove(yaml_path) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py index 86832f485c162..5e545a7a63a0e 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py @@ -27,7 +27,6 @@ from paddle.io import Dataset from paddle.static import InputSpec from paddle.fluid.framework import _non_static_mode -from paddle.distributed.auto_parallel.engine import Engine from paddle.distributed.auto_parallel.helper import ProgramHelper batch_size = 4 @@ -140,23 +139,19 @@ def test_to_static(self): dataset = MyDataset(batch_num * batch_size) - inputs = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels = InputSpec([batch_size], 'int64', 'label') + # inputs = InputSpec([batch_size, hidden_size], 'float32', 'x') + # labels = InputSpec([batch_size], 'int64', 'label') - engine = Engine(model=mlp, - inputs_spec=inputs, - labels_spec=labels, - strategy=None) assert _non_static_mode() == True - - engine.prepare(optimizer=optimizer, - loss=loss, - metrics=paddle.metric.Accuracy()) - - assert _non_static_mode() == False + engine = auto.Engine(model=mlp, + loss=loss, + optimizer=optimizer, + metrics=paddle.metric.Accuracy(), + 
strategy=None) engine.fit(dataset, batch_size=batch_size) engine.evaluate(dataset, batch_size=batch_size) engine.predict(dataset, batch_size=batch_size) + assert _non_static_mode() == False class TestLazyInit(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py index 3dabe38ff6e1d..1c869813d319b 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -36,7 +36,7 @@ epoch_num = 10 hidden_size = 1024 sequence_len = 512 -_g_process_mesh = [[0, 1], [2, 3]] +_g_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=['x', 'y']) def get_random_inputs_and_labels(input_shape, label_shape): @@ -84,18 +84,12 @@ def __init__(self, def forward(self, input): out = self.norm(input) - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.linear0.weight, _g_process_mesh[:, 0], + [None, 'x']) out = self.linear0(out) out = F.gelu(out, approximate=True) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _g_process_mesh[1], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear1.weight, _g_process_mesh[:, 1], + ['x', None]) out = self.linear1(out) return out @@ -155,16 +149,8 @@ def get_program(): dataloader.set_batch_generator(batch_generator_creator(), places=paddle.static.cuda_places()) # data dist_attr - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [-1, -1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(input, _g_process_mesh[:, 0], [None, None, None]) + auto.shard_tensor(label, _g_process_mesh[:, 0], [None, None, None]) mlp_start = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py index 3c6e086ae7fac..444e0df454d96 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py @@ -37,7 +37,7 @@ epoch_num = 10 hidden_size = 1024 sequence_len = 512 -_g_process_mesh = auto.ProcessMesh([0, 1]) +_g_process_mesh = auto.ProcessMesh([0, 1], dim_names=['x']) def get_random_inputs_and_labels(input_shape, label_shape): @@ -85,61 +85,21 @@ def __init__(self, def forward(self, input): - auto.shard_tensor(self.norm.weight, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) - auto.shard_tensor(self.norm.bias, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear0.bias, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(self.linear1.bias, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(self.norm.weight, _g_process_mesh, [None]) + auto.shard_tensor(self.norm.bias, _g_process_mesh, [None]) + 
auto.shard_tensor(self.linear0.weight, _g_process_mesh, [None, 'x']) + auto.shard_tensor(self.linear0.bias, _g_process_mesh, ['x']) + auto.shard_tensor(self.linear1.weight, _g_process_mesh, ['x', None]) + auto.shard_tensor(self.linear1.bias, _g_process_mesh, [None]) out = self.norm(input) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(out, _g_process_mesh, [None, None, None]) out = self.linear0(out) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, 0] - }) + auto.shard_tensor(out, _g_process_mesh, [None, None, 'x']) out = F.gelu(out, approximate=True) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, 0] - }) + auto.shard_tensor(out, _g_process_mesh, [None, None, 'x']) out = self.linear1(out) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(out, _g_process_mesh, [None, None, None]) return out @@ -155,21 +115,13 @@ def get_program(): # loop counter i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) - auto.shard_tensor(i, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(i, _g_process_mesh, [None]) # loop count (number of iterations) loop_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=epoch_num) - auto.shard_tensor(loop_len, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(loop_len, _g_process_mesh, [None]) # input input = static.data(name="input", @@ -188,25 +140,13 @@ def get_program(): dataloader.set_batch_generator(batch_generator_creator(), places=paddle.static.cuda_places()) # data dist_attr - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(input, _g_process_mesh, [None, None, None]) + auto.shard_tensor(label, _g_process_mesh, [None, None, None]) # fill constant bsz like tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0) - auto.shard_tensor(tmp, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, 0, -1, -1] - }) + auto.shard_tensor(tmp, _g_process_mesh, [None, 'x', None, None]) # model mlp_start = MLPLayer(hidden_size=hidden_size, @@ -216,28 +156,21 @@ def get_program(): pred = mlp_start(input) input_array = fluid.layers.array_write(pred, i) - auto.shard_tensor(input_array, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + # TODO: check whether this annotation is needed + # auto.shard_tensor(input_array, + # dist_attr={ + # "process_mesh": _g_process_mesh, + # "dims_mapping": [-1, -1, -1] + # }) cond = fluid.layers.less_than(x=i, y=loop_len) - auto.shard_tensor(cond, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(cond, _g_process_mesh, [None]) while_op = fluid.layers.While(cond=cond) with while_op.block(): pre_input = fluid.layers.array_read(array=input_array, i=i) - auto.shard_tensor(pre_input, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(pre_input, _g_process_mesh, [None, None, None]) mlp_while = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -251,11 +184,7 @@ def get_program(): 
fluid.layers.less_than(x=i, y=loop_len, cond=cond) end_pred = fluid.layers.array_read(array=input_array, i=i) - auto.shard_tensor(end_pred, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(end_pred, _g_process_mesh, [None, None, None]) mlp_end = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -264,18 +193,10 @@ def get_program(): pred = mlp_end(end_pred) error_cost = paddle.nn.functional.square_error_cost(pred, label) - auto.shard_tensor(error_cost, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(error_cost, _g_process_mesh, [None, None, None]) loss = paddle.mean(error_cost) - auto.shard_tensor(loss, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(loss, _g_process_mesh, [None]) return train_program, start_program, dataloader, i, loss diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py index c3f64e30fc596..2e65c9bd46735 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py @@ -67,38 +67,18 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, "x"]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, None]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, None]) out = self.norm(input) out = self.linear0(out) @@ -120,28 +100,12 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None, None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh, ["x", None]) elif _global_parallel_strategy == "mp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None]) mlp = 
MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -186,7 +150,7 @@ def test_mlp_mp2pp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) input = np.random.random(size=(80, 64)).astype('float32') label = np.random.random(size=(80, 1)).astype('float32') @@ -212,11 +176,11 @@ def test_mlp_mp2pp(self): set_default_distributed_context(None) _global_parallel_strategy = "pp" - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) global PP_MESH_0 - PP_MESH_0 = auto.ProcessMesh(mesh=[0]) + PP_MESH_0 = auto.ProcessMesh(mesh=[0], dim_names=["pp0"]) global PP_MESH_1 - PP_MESH_1 = auto.ProcessMesh(mesh=[1]) + PP_MESH_1 = auto.ProcessMesh(mesh=[1], dim_names=["pp1"]) dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program( ) @@ -268,7 +232,7 @@ def test_mlp_pp2mp(self): global _global_parallel_strategy _global_parallel_strategy = "pp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) global PP_MESH_0 PP_MESH_0 = auto.ProcessMesh(mesh=[0]) global PP_MESH_1 @@ -303,7 +267,7 @@ def test_mlp_pp2mp(self): set_default_distributed_context(None) _global_parallel_strategy = "mp" - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program( ) @@ -350,7 +314,7 @@ def test_input_invalid(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) dist_main_prog, _, _ = get_distributed_program() with self.assertRaises(TypeError): save_distributed_checkpoint(dist_main_prog, [""], [""], diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py index 9d2b273940121..c7ce4c2326cf2 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py @@ -38,7 +38,7 @@ def test_dp2pp1mp1(self): def create_model(train_program, start_program): with paddle.static.program_guard(train_program, start_program): - MESH_0 = auto.ProcessMesh([0, 1]) + MESH_0 = auto.ProcessMesh([0, 1], dim_names=["x"]) input = paddle.static.data(name='input', shape=[2, 8]) label = paddle.static.data(name='label', shape=[2, 8]) @@ -47,26 +47,10 @@ def create_model(train_program, start_program): linear0 = nn.Linear(8, 8, weight_attr) linear1 = nn.Linear(8, 8, weight_attr) - auto.shard_tensor(input, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(linear0.weight, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(linear1.weight, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, MESH_0, ["x", None]) + auto.shard_tensor(label, MESH_0, ["x", None]) + auto.shard_tensor(linear0.weight, MESH_0, [None, None]) + auto.shard_tensor(linear1.weight, MESH_0, [None, None]) linear0_out = linear0(input) gelu_out = 
F.gelu(linear0_out) @@ -124,7 +108,7 @@ def dp1pp1mp2(self): def create_model(train_program, start_program): with paddle.static.program_guard(train_program, start_program): - MESH_0 = auto.ProcessMesh([0, 1]) + MESH_0 = auto.ProcessMesh([0, 1], dim_names=["x"]) input = paddle.static.data(name='input', shape=[8, 8]) label = paddle.static.data(name='label', shape=[8, 8]) @@ -133,27 +117,10 @@ def create_model(train_program, start_program): linear0 = nn.Linear(8, 8, weight_attr) linear1 = nn.Linear(8, 8, weight_attr) - auto.shard_tensor(input, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, -1] - }) - - auto.shard_tensor(linear0.weight, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(linear1.weight, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, MESH_0, [None, None]) + auto.shard_tensor(label, MESH_0, [None, None]) + auto.shard_tensor(linear0.weight, MESH_0, [None, "x"]) + auto.shard_tensor(linear1.weight, MESH_0, ["x", None]) linear0_out = linear0(input) gelu_out = F.gelu(linear0_out) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py index 8aef4d1086066..e7f721dd422cf 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py @@ -114,30 +114,18 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ q = self.q_proj(query) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.q_proj.weight, _global_process_mesh, + [None, "x"]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(self.q_proj.weight, _global_process_mesh, + [None, "y"]) elif _global_parallel_strategy == "mp_pp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.q_proj.weight, MPPP_MESH_LIST[self.mesh_idx], + [None, "x"]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], [None, "y"]) + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): @@ -165,56 +153,30 @@ def compute_kv(self, key, value): """ k = self.k_proj(key) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.k_proj.weight, _global_process_mesh, + [None, "x"]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(self.k_proj.weight, _global_process_mesh, + [None, "y"]) elif _global_parallel_strategy == "mp_pp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 0] - }) + 
auto.shard_tensor(self.k_proj.weight, MPPP_MESH_LIST[self.mesh_idx], + [None, "x"]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], [None, "y"]) v = self.v_proj(value) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.v_proj.weight, _global_process_mesh, + [None, "x"]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(self.v_proj.weight, _global_process_mesh, + [None, "y"]) elif _global_parallel_strategy == "mp_pp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.v_proj.weight, MPPP_MESH_LIST[self.mesh_idx], + [None, "x"]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], [None, "y"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) @@ -287,30 +249,18 @@ def forward(self, # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.out_proj.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.out_proj.weight, _global_process_mesh, + ["y", None]) elif _global_parallel_strategy == "mp_pp": auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [0, -1] - }) + MPPP_MESH_LIST[self.mesh_idx], ["x", None]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [1, -1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], ["y", None]) + outs = [out] if self.need_weights: outs.append(weights) @@ -352,96 +302,53 @@ def forward(self, new_caches = [] self.checkpoints = [] if _global_parallel_strategy == "pp": - auto.shard_tensor(output, - dist_attr={ - "process_mesh": - PP_MESH_LIST[0], - "dims_mapping": - [-1 for i in range(len(output.shape))] - }) + auto.shard_tensor(output, PP_MESH_LIST[0], + [None for i in range(len(output.shape))]) if _global_parallel_strategy == "dp_pp": - auto.shard_tensor(output, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[0], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + auto.shard_tensor(output, DPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(output.shape) - 1)]) if _global_parallel_strategy == "mp_pp": - auto.shard_tensor(output, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[0], - "dims_mapping": [-1] + - [-1 for i in range(len(output.shape) - 1)] - }) + auto.shard_tensor(output, MPPP_MESH_LIST[0], + [None for i in range(len(output.shape))]) if 
_global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(output, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[0], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + auto.shard_tensor(output, DPMPPP_MESH_LIST[0], ["x"].extends( + [None for i in range(len(output.shape) - 1)])) for i, mod in enumerate(self.layers): if cache is None: if use_cache: if _global_parallel_strategy == "pp": output, new_cache = auto.shard_op( - mod, - dist_attr={ - "process_mesh": PP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + mod, PP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - PP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [-1 for i in range(len(output.shape))] - }) + output, PP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_pp": output, new_cache = auto.shard_op( - mod, - dist_attr={ - "process_mesh": DPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + mod, DPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, DPPP_MESH_LIST[mod.mesh_idx], ["x"].extends( + [None for i in range(len(output.shape) - 1)])) elif _global_parallel_strategy == "mp_pp": output, new_cache = auto.shard_op( - mod, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + mod, MPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [-1] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, MPPP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_mp_pp": output, new_cache = auto.shard_op( mod, - dist_attr={ - "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + DPMPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, use_cache, + cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, DPMPPP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) else: output, new_cache = mod(output, memory, @@ -451,64 +358,36 @@ def forward(self, new_caches.append(new_cache) else: if _global_parallel_strategy == "pp": - output = auto.shard_op(mod, - dist_attr={ - "process_mesh": - PP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, - use_cache, cache) + output = auto.shard_op(mod, PP_MESH_LIST[mod.mesh_idx])( + output, memory, tgt_mask, use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - PP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [-1 for i in range(len(output.shape))] - }) + output, PP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_pp": - output = auto.shard_op(mod, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, - use_cache, cache) + output = auto.shard_op( + mod, DPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[mod.mesh_idx], - 
"dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, DPPP_MESH_LIST[mod.mesh_idx], ["x"].extends( + [None for i in range(len(output.shape) - 1)])) elif _global_parallel_strategy == "mp_pp": - output = auto.shard_op(mod, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, - use_cache, cache) + output = auto.shard_op( + mod, MPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [-1] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, MPPP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_mp_pp": - output = auto.shard_op( - mod, - dist_attr={ - "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + output = auto.shard_op(mod, + DPMPPP_MESH_LIST[mod.mesh_idx])( + output, memory, tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, DPMPPP_MESH_LIST[mod.mesh_idx], + ["x"].extends( + [None for i in range(len(output.shape) - 1)])) else: output = mod(output, memory, @@ -519,58 +398,33 @@ def forward(self, if _global_parallel_strategy == "pp": output, new_cache = auto.shard_op( mod, - dist_attr={"process_mesh": PP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, - cache) - auto.shard_tensor( - output, - dist_attr={ - "process_mesh": PP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [-1 for i in range(len(output.shape))] - }) + PP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask, + use_cache, cache) + auto.shard_tensor(output, PP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_pp": output, new_cache = auto.shard_op( mod, - dist_attr={ - "process_mesh": DPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [0] + [-1 for i in range(len(output.shape) - 1)] - }) + DPPP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask, + use_cache, cache) + auto.shard_tensor(output, DPPP_MESH_LIST[mod.mesh_idx], [ + "x" + ].extends([None for i in range(len(output.shape) - 1)])) elif _global_parallel_strategy == "mp_pp": output, new_cache = auto.shard_op( mod, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [-1] + [-1 for i in range(len(output.shape) - 1)] - }) + MPPP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask, + use_cache, cache) + auto.shard_tensor(output, MPPP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_mp_pp": output, new_cache = auto.shard_op( - mod, - dist_attr={ - "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [0] + [-1 for i in range(len(output.shape) - 1)] - }) + mod, DPMPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) + auto.shard_tensor(output, DPMPPP_MESH_LIST[mod.mesh_idx], [ + "x" + ].extends([None for 
i in range(len(output.shape) - 1)])) else: output, new_cache = mod(output, memory, @@ -661,55 +515,30 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): if self.normalize_before: tgt = self.norm2(tgt) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, "x"]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, "y"]) elif _global_parallel_strategy == "mp_pp": auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 0] - }) + MPPP_MESH_LIST[self.mesh_idx], [None, "x"]) if _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], [None, "y"]) + if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear2.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear2.weight, _global_process_mesh, + ["y", None]) elif _global_parallel_strategy == "mp_pp": auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [0, -1] - }) + MPPP_MESH_LIST[self.mesh_idx], ["x", None]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [1, -1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], ["y", None]) tgt = self.dropout2( self.linear2(F.gelu(self.linear1(tgt), approximate=True))) tgt = residual + tgt @@ -757,29 +586,18 @@ def forward(self, input_ids, position_ids=None): position_ids = seq_length - ones input_embedings = self.word_embeddings(input_ids) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, _global_process_mesh, + ["y", None]) elif _global_parallel_strategy == "mp_pp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[0], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, MPPP_MESH_LIST[0], + ["x", None]) elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": DPMPPP_MESH_LIST[0], - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, DPMPPP_MESH_LIST[0], + ["y", None]) + position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings embeddings = 
self.dropout(embeddings) @@ -868,29 +686,14 @@ def forward(self, embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids) if _global_parallel_strategy == "pp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": - PP_MESH_LIST[0], - "dims_mapping": - [-1 for i in range(len(input_ids.shape))] - }) + auto.shard_tensor(input_ids, PP_MESH_LIST[0], + [None for i in range(len(input_ids.shape))]) if _global_parallel_strategy == "dp_pp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[0], - "dims_mapping": [0] + - [-1 for i in range(len(input_ids.shape) - 1)] - }) + auto.shard_tensor(input_ids, DPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(input_ids.shape) - 1)]) if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[0], - "dims_mapping": [0] + - [-1 for i in range(len(input_ids.shape) - 1)] - }) + auto.shard_tensor(input_ids, DPMPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(input_ids.shape) - 1)]) encoder_outputs = self.decoder(embedding_output, memory=None, tgt_mask=attention_mask, @@ -923,6 +726,10 @@ def forward(self, masked_positions=None, use_cache=False, cache=None): + input_ids.stop_gradient = True + position_ids.stop_gradient = True + attention_mask.stop_gradient = True + outputs = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask, @@ -936,40 +743,42 @@ def forward(self, x = encoder_outputs w = self.gpt.embeddings.word_embeddings.weight - mesh = _global_process_mesh - x_dims_mapping = [-1 for i in range(len(x.shape))] - w_dims_mapping = [-1 for i in range(len(w.shape))] + mesh = None if _global_parallel_strategy == "pp": mesh = PP_MESH_LIST[-1] + x_dims_mapping = [None for i in range(len(x.shape))] + w_dims_mapping = [None for i in range(len(w.shape))] elif _global_parallel_strategy == "dp": - x_dims_mapping = [0] + [-1 for i in range(len(x.shape) - 1)] + mesh = _global_process_mesh + x_dims_mapping = ["x"] + [None for i in range(len(x.shape) - 1)] + w_dims_mapping = [None for i in range(len(w.shape))] elif _global_parallel_strategy == "mp": - w_dims_mapping = [0] + [-1 for i in range(len(w.shape) - 1)] + mesh = _global_process_mesh + x_dims_mapping = [None for i in range(len(x.shape))] + w_dims_mapping = ["x"] + [None for i in range(len(w.shape) - 1)] elif _global_parallel_strategy == "dp_mp": - x_dims_mapping = [0] + [-1 for i in range(len(x.shape) - 1)] - w_dims_mapping = [1] + [-1 for i in range(len(w.shape) - 1)] + mesh = _global_process_mesh + x_dims_mapping = ["x"] + [None for i in range(len(x.shape) - 1)] + w_dims_mapping = ["y"] + [None for i in range(len(w.shape) - 1)] elif _global_parallel_strategy == "dp_pp": mesh = DPPP_MESH_LIST[-1] - x_dims_mapping = [0] + [-1 for i in range(len(x.shape) - 1)] + x_dims_mapping = ["x"] + [None for i in range(len(x.shape) - 1)] + w_dims_mapping = [None for i in range(len(w.shape))] elif _global_parallel_strategy == "mp_pp": mesh = MPPP_MESH_LIST[-1] - w_dims_mapping = [0] + [-1 for i in range(len(w.shape) - 1)] + x_dims_mapping = [None for i in range(len(x.shape))] + w_dims_mapping = ["x"] + [None for i in range(len(w.shape) - 1)] elif _global_parallel_strategy == "dp_mp_pp": mesh = DPMPPP_MESH_LIST[-1] - x_dims_mapping = [0] + [-1 for i in range(len(x.shape) - 1)] - w_dims_mapping = [1] + [-1 for i in range(len(w.shape) - 1)] - - matmul = auto.shard_op(paddle.matmul, - dist_attr={ - 'process_mesh': mesh, - x: { - "dims_mapping": x_dims_mapping - }, 
- w: { - "dims_mapping": w_dims_mapping - } - }) - logits = matmul(x, w, transpose_y=True) + x_dims_mapping = ["x"] + [None for i in range(len(x.shape) - 1)] + w_dims_mapping = ["y"] + [None for i in range(len(w.shape) - 1)] + + if mesh: + matmul = auto.shard_op(paddle.matmul, mesh, + [x_dims_mapping, w_dims_mapping, None]) + logits = matmul(x, w, transpose_y=True) + else: + logits = paddle.matmul(x, w, transpose_y=True) if use_cache: return logits, cached_kvs @@ -988,25 +797,29 @@ def __init__(self): self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") def forward(self, prediction_scores, masked_lm_labels, loss_mask): + masked_lm_labels.stop_gradient = True + loss_mask.stop_gradient = True - mesh = _global_process_mesh - dims_mapping = [-1 for i in range(len(loss_mask.shape))] + mesh = None if _global_parallel_strategy == "dp": - dims_mapping = [0] + [-1 for i in range(len(loss_mask.shape) - 1)] + mesh = _global_process_mesh + dims_mapping = ["x" + ] + [None for i in range(len(loss_mask.shape) - 1)] elif _global_parallel_strategy == "dp_mp": - dims_mapping = [0] + [-1 for i in range(len(loss_mask.shape) - 1)] + mesh = _global_process_mesh + dims_mapping = ["x" + ] + [None for i in range(len(loss_mask.shape) - 1)] elif _global_parallel_strategy == "dp_pp": mesh = DPPP_MESH_LIST[-1] - dims_mapping = [0] + [-1 for i in range(len(loss_mask.shape) - 1)] + dims_mapping = ["x" + ] + [None for i in range(len(loss_mask.shape) - 1)] elif _global_parallel_strategy == "dp_mp_pp": mesh = DPMPPP_MESH_LIST[-1] - dims_mapping = [0] + [-1 for i in range(len(loss_mask.shape) - 1)] + dims_mapping = ["x" + ] + [None for i in range(len(loss_mask.shape) - 1)] - auto.shard_tensor(loss_mask, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": dims_mapping - }) + if mesh: + auto.shard_tensor(loss_mask, mesh, dims_mapping) masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py index 12f4cc08b0874..e98577f8458b8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py @@ -64,38 +64,18 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, "x"]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, None]) + 
auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, None]) out = self.norm(input) out = self.linear0(out) @@ -119,28 +99,12 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None, None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh, ["x", None]) elif _global_parallel_strategy == "mp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -183,7 +147,7 @@ def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) dist_main_prog, dist_start_prog, loss = get_distributed_program() place = paddle.set_device("gpu") @@ -230,7 +194,7 @@ def test_mlp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) dist_main_prog, dist_start_prog, loss = get_distributed_program() @@ -278,11 +242,11 @@ def test_mlp_pp(self): global _global_parallel_strategy _global_parallel_strategy = "pp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) global PP_MESH_0 - PP_MESH_0 = auto.ProcessMesh(mesh=[0]) + PP_MESH_0 = auto.ProcessMesh(mesh=[0], dim_names=["x"]) global PP_MESH_1 - PP_MESH_1 = auto.ProcessMesh(mesh=[1]) + PP_MESH_1 = auto.ProcessMesh(mesh=[1], dim_names=["x"]) dist_main_prog, dist_start_prog, loss = get_distributed_program() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py index 76698a7a8b5fd..c8e1b3965228d 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py @@ -433,6 +433,42 @@ def init_data(self): ] +class TestCastPJVPAndTranspose(TestAddPJVPAndTranspose): + + def init_data(self): + # Set prim op + self.op_type = 'cast_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + self.prim_input = { + 'X': X, + } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'dtype': paddle.float64} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: True + Y_BAR = paddle.static.data(name='Y_BAR', shape=[5, 6], dtype='float') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = {0: X} + + self.all_ops = [ + # prim op: + 'cast_p', + # jvp op: + 'cast_p', + # transpose op: + 'cast_p' + ] + + class 
TestLogPJVPAndTranspose(TestAddPJVPAndTranspose): def init_data(self): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py index 92a50d8bb1b08..e1d5ee11a13ac 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py @@ -110,6 +110,26 @@ def init_data(self): self.out_map = {0: self.output['Out']} +class TestElementWiseDivOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'elementwise_div' + X = paddle.static.data(name='X', shape=[8, 8], dtype='float') + Y = paddle.static.data(name='Y', shape=[8, 8], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, Y) + self.all_ops = ['elementwise_div', 'div_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + class TestMatmulV2Orig2Prim(TestElementWiseAddOrig2Prim): def init_data(self): @@ -786,5 +806,78 @@ def init_data(self): self.out_map = {0: self.output['Out']} +class TestSizeOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'size' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'Input': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference( + dtype=paddle.int64) + } + self.attrs = {} + self.orig2prim_args = (X, ) + self.all_ops = ['size', 'fill_constant_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestCastOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'cast' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'X': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'in_dtype': X.dtype, 'out_dtype': paddle.float64} + self.orig2prim_args = (X, ) + self.all_ops = ['cast', 'cast_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestPowScalarOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'pow' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'X': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'factor': 2.} + self.orig2prim_args = (None, X) + self.all_ops = ['pow', 'pow_p', 'fill_constant_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestSquareOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'square' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'X': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + self.orig2prim_args = (X, ) + self.all_ops = ['square', 'pow_p', 'fill_constant_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py index c173cc4790dc2..a89b91bdd2b64 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py +++ 
b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py @@ -670,5 +670,25 @@ def init_data(self): self.out_map = {self.output['Z']: 0} +class TestCastPPrim2Orig(TestAddPPrim2Orig): + + def init_data(self): + self.op_type = 'cast_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + + self.input = { + 'X': X, + } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'dtype': paddle.int64} + + self.prim2orig_args = (X, ) + self.all_ops = ['cast_p', 'cast'] + self.out_map = {self.output['Y']: 0} + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index d010e69e75950..bdc54563fc8d2 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -257,6 +257,8 @@ def test_illegal_param(self): (np.random.rand(2, 3), np.random.rand(3, 2)), None, 'float32'), ('multiply', paddle.multiply, (np.random.rand(2, 3), np.random.rand(2, 3)), None, 'float64'), + ('div', paddle.divide, + (np.random.rand(2, 3), np.random.rand(2, 3)), None, 'float64'), ('add', paddle.add, (np.random.rand(2, 3), np.random.rand(2, 3)), None, 'float32'), ('input_not_sequence', paddle.tanh, @@ -300,7 +302,21 @@ def test_illegal_param(self): (np.random.rand(200, 345), ), None, 'float32'), ('abs', paddle.abs, (np.random.uniform(-10, 10, (200, 345)), ), None, 'float32'), - )) + ('cast_float', lambda x: paddle.cast(x, paddle.float64), + (np.random.rand(10, 20), ), None, 'float32'), + ('cast_int', lambda x: paddle.cast(x, paddle.int32), + (np.random.rand(10, 20), ), None, 'float32'), + ('square', paddle.square, (np.random.rand(100), ), None, 'float32'), + ('pow_scalar', lambda x: paddle.pow(x, 2), + (np.random.rand(20, 30), ), None, 'float32'), + ('var', paddle.var, (np.random.rand(200, 324), ), None, 'float32'), + ('var_with_axis', lambda x: paddle.var(x, axis=1), + (np.random.rand(10, 20, 30), ), None, 'float32'), + ('var_without_unbiased', + lambda x: paddle.var(x, axis=1, unbiased=False), + (np.random.rand(10, 20, 30), ), None, 'float32'), + ('var_with_keepdim', lambda x: paddle.var(x, axis=1, keepdim=True), + (np.random.rand(10, 20, 30), ), None, 'float32'))) class TestGrad(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py index ba6f094e68008..35291432f6e8f 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primops.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primops.py @@ -44,6 +44,9 @@ ('erf', primops.erf, randn(2, 3), {}, (2, 3), 'float64'), ('abs', primops.abs, randn(2, 3), {}, (2, 3), 'float64'), ('log', primops.log, randn(2, 3), {}, (2, 3), 'float64'), + ('cast', primops.cast, randn(2, 3), { + 'dtype': paddle.int64 + }, (2, 3), 'int64'), ('reshape', primops.reshape, randn(2, 3), { 'shape': (3, 2) }, (3, 2), 'float64'), diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 5a1a6df2dd7ec..55f4453b1ab08 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -268,17 +268,26 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_eager_dist_api MODULES 
test_eager_dist_api ENVS - "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") - set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS - "RUN_TYPE=DIST") + test_communication_stream_allreduce_api MODULES + test_communication_stream_allreduce_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_allreduce_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_new_group_api MODULES test_new_group_api ENVS + test_communication_stream_sendrecv_api MODULES + test_communication_stream_sendrecv_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_sendrecv_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_eager_dist_api MODULES test_eager_dist_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") - set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS - "RUN_TYPE=DIST") + set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM @@ -298,11 +307,10 @@ if((WITH_GPU endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_communication_stream_allreduce_api MODULES - test_communication_stream_allreduce_api ENVS - "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") - set_tests_properties(test_communication_stream_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + test_new_group_api MODULES test_new_group_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=DIST") endif() if((WITH_ROCM OR WITH_GPU) AND (LINUX)) bash_test_modules( diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py new file mode 100644 index 0000000000000..175e24c3d0d86 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py @@ -0,0 +1,68 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.fluid as fluid +import test_collective_api_base as test_collective_base +import test_communication_api_base as test_base + + +class StreamSendRecvTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + if rank == 0: + task = dist.stream.send(tensor, + dst=1, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + else: + task = dist.stream.recv(tensor, + src=0, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + result = test_data_list[0] + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamSendRecvTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py index 688a31b78de00..2aa113b55d5c9 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py @@ -82,11 +82,7 @@ def mlp_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len, 1], dtype='float32') - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mappig": [-1, -1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -106,7 +102,7 @@ class TestMLPAutoParallelizer(unittest.TestCase): def test_mlp_serial(self): global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) dist_strategy = fleet.DistributedStrategy() dist_strategy.amp = False diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py new file mode 100644 index 0000000000000..9590519bc2e13 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import test_communication_api_base as test_base + + +class TestCommunicationStreamSendRecvAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamSendRecvAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_sendrecv_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_sendrecv_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamSendRecvAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index 16eb200565f73..b4ba281f45420 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -32,8 +32,9 @@ test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_ test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=, diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py index ec879e77611cd..3091a927a8224 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py @@ -86,7 +86,7 @@ def _run_gpu_main(self, model, apply_pass, dump_file, **kwargs): paddle.static.Program()): with paddle.static.scope_guard(scope): with paddle.fluid.unique_name.guard(): - main_prog, startup_prog, inputs, outputs, reader = self.get_model( + main_prog, startup_prog, inputs, outputs, data_loader = self.get_model( place, **kwargs) inputs = self._to_var_names(inputs) outputs = self._to_var_names(outputs) @@ -95,27 +95,57 @@ def 
_run_gpu_main(self, model, apply_pass, dump_file, **kwargs): exe = paddle.static.Executor(place) with paddle.static.scope_guard(scope): exe.run(startup_prog) - for batch_id, input_data in enumerate(reader()): - assert len(input_data) == len(inputs), "{} vs {}".format( - len(input_data), len(inputs)) - feed = dict(zip(inputs, input_data)) - fetch_values = exe.run(main_prog, feed=feed, fetch_list=outputs) - if paddle.distributed.get_rank() == 0: - output_dict = OrderedDict(zip(outputs, fetch_values)) - print('batch {}, outputs {}'.format(batch_id, output_dict)) - all_fetch_values.append(fetch_values) + data_loader.start() + batch_id = 0 + while True: + try: + fetch_values = exe.run(main_prog, fetch_list=outputs) + if paddle.distributed.get_rank() == 0: + output_dict = OrderedDict(zip(outputs, fetch_values)) + print('batch {}, outputs {}'.format( + batch_id, output_dict)) + all_fetch_values.append(fetch_values) + batch_id += 1 + except paddle.fluid.core.EOFException: + data_loader.reset() + break with open(dump_file, "wb") as f: pickle.dump(all_fetch_values, f) def get_gpt_model(self, strategy, place, batch_size, sequence_len, vocab_size, **kwargs): + + def gen_data(): + np.random.seed(2021) + for _ in range(10): + tokens = [] + position_ids = [] + attention_mask = [] + labels = [] + loss_mask = [] + for _ in range(batch_size): + tokens.append( + np.random.randint(vocab_size, + size=sequence_len).astype("int64")) + position_ids.append(np.arange(sequence_len).astype("int64")) + attention_mask.append( + [np.tril(np.ones(sequence_len)).astype("float32")]) + labels.append( + np.random.randint(vocab_size, + size=sequence_len).astype("int64")) + loss_mask.append(np.ones(sequence_len).astype("float32")) + + yield tokens, position_ids, attention_mask, labels, loss_mask + modeling.init_global() if strategy == "dp": modeling._global_parallel_strategy = "dp" - modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1], + dim_names=["x"]) elif strategy == "mp": modeling._global_parallel_strategy = "mp" - modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1], + dim_names=["x"]) else: raise ValueError("'get_gpt_model' only support dp and mp.") @@ -137,23 +167,17 @@ def get_gpt_model(self, strategy, place, batch_size, sequence_len, dtype='float32') data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + data_loader = paddle.fluid.io.DataLoader.from_generator( + feed_list=data_holder, capacity=70, iterable=False) + data_loader.set_batch_generator(gen_data, paddle.static.cuda_places()) + if modeling._global_parallel_strategy == "dp": - auto.shard_tensor(tokens, - dist_attr={ - "process_mesh": modeling._global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(tokens, modeling._global_process_mesh, + ["x", None]) elif modeling._global_parallel_strategy == "pp": - auto.shard_tensor(tokens, - dist_attr={ - "process_mesh": modeling.PP_MESH_LIST[0], - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(attention_mask, - dist_attr={ - "process_mesh": modeling.PP_MESH_LIST[0], - "dims_mapping": [-1, -1, -1, -1] - }) + auto.shard_tensor(tokens, modeling.PP_MESH_LIST[0], [None, None]) + auto.shard_tensor(attention_mask, modeling.PP_MESH_LIST[0], + [None, None, None, None]) gpt = GPTModel(vocab_size=1000, hidden_size=64, @@ -178,40 +202,21 @@ def get_gpt_model(self, strategy, place, batch_size, sequence_len, preds = model(tokens, position_ids, 
attention_mask) criterion = GPTPretrainingCriterion() loss = criterion(preds, labels, loss_mask) - clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) if kwargs.get('optimizer', None) == "LarsMomentum": optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer( learning_rate=0.001, momentum=0.9) else: - optimizer = paddle.fluid.optimizer.AdamOptimizer( - learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=clip) + optimizer = paddle.optimizer.Adam(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=clip) optimizer = fleet.distributed_optimizer(optimizer) startup_program = paddle.static.default_startup_program() _, _, dist_startup_prog, dist_main_prog = optimizer.minimize( loss, startup_program) - def gen_data(): - np.random.seed(2021) - for _ in range(10): - tokens = [] - position_ids = [] - attention_mask = [] - labels = [] - loss_mask = [] - for _ in range(batch_size): - tokens.append( - np.random.randint(vocab_size, size=sequence_len)) - position_ids.append(np.arange(sequence_len)) - attention_mask.append([np.tril(np.ones(sequence_len))]) - labels.append( - np.random.randint(vocab_size, size=sequence_len)) - loss_mask.append(np.ones(sequence_len)) - - yield tokens, position_ids, attention_mask, labels, loss_mask - - return dist_main_prog, dist_startup_prog, data_holder, [loss], gen_data + return dist_main_prog, dist_startup_prog, data_holder, [loss + ], data_loader diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py index 5ac78cc5fec4d..4c20153ccbfd9 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py @@ -20,10 +20,19 @@ import paddle import paddle.distributed.fleet as fleet from auto_parallel_pass_test_base import AutoPallelPassTestBase -from test_auto_parallel_amp_pass import TestAMPPass -class TestPF16Pass(TestAMPPass): +class TestPF16Pass(AutoPallelPassTestBase): + + def init(self): + if paddle.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': 1}) + self.rtol = 1e-5 + self.atol = 1e-8 + + paddle.seed(2021) + random.seed(2021) + np.random.seed(2021) def apply_passes(self): dist_strategy = fleet.DistributedStrategy() @@ -34,14 +43,30 @@ def apply_passes(self): 'layer_norm', 'gelu', ], - "custom_black_list": ['c_softmax_with_cross_entropy'], - "init_loss_scaling": 32768, - "use_dynamic_loss_scaling": True, - "use_pure_fp16": True + "custom_black_list": + ['c_softmax_with_cross_entropy', 'elementwise_div', 'reduce_sum'], + "init_loss_scaling": + 32768, + "use_dynamic_loss_scaling": + True, + "use_pure_fp16": + True, + "use_fp16_guard": + False } dist_strategy.semi_auto = True fleet.init(is_collective=True, strategy=dist_strategy) + def test_bs_8(self): + self.check_main(gpus=[0, 1], + batch_size=8, + sequence_len=512, + vocab_size=1000) + + def get_model(self, place, batch_size, sequence_len, vocab_size): + return self.get_gpt_model("mp", place, batch_size, sequence_len, + vocab_size) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py index 50e1871820186..8f45b67090e93 100644 --- 
a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py @@ -97,11 +97,8 @@ def forward(self, input): def mlp_forward(input, label, hidden_size): - auto.shard_tensor(input, - dist_attr={ - "process_mesh": [0], - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, auto.ProcessMesh([0], dim_names=["x"]), + [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, initializer_range=0.02) @@ -160,6 +157,12 @@ def test_result(self): def get_model(self, place, batch_size, hidden_size, max_step): + def gen_data(): + for i in range(max_step): + x_data = input_data[i * batch_size:(i + 1) * batch_size, :] + y_data = label_data[i * batch_size:(i + 1) * batch_size, :] + yield x_data, y_data + train_program = static.Program() startup_program = static.Program() with static.program_guard(train_program, startup_program), \ @@ -171,6 +174,12 @@ def get_model(self, place, batch_size, hidden_size, max_step): shape=[batch_size, 1], dtype='float32') input.stop_gradient = False + data_holder = [input, label] + data_loader = paddle.fluid.io.DataLoader.from_generator( + feed_list=data_holder, capacity=70, iterable=False) + data_loader.set_batch_generator(gen_data, + paddle.static.cuda_places()) + loss = mlp_forward(input, label, hidden_size) optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.01) @@ -181,13 +190,8 @@ def get_model(self, place, batch_size, hidden_size, max_step): input_data = np.random.random(size=(128, hidden_size)).astype('float32') label_data = np.random.random(size=(128, 1)).astype('float32') - def reader(): - for i in range(max_step): - x_data = input_data[i * batch_size:(i + 1) * batch_size, :] - y_data = label_data[i * batch_size:(i + 1) * batch_size, :] - yield x_data, y_data - - return dist_main_prog, dist_startup_prog, [input, label], [loss], reader + return dist_main_prog, dist_startup_prog, [input, + label], [loss], data_loader if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py index 9343f1ebd7cd0..75b5cba9e81b3 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py @@ -19,11 +19,15 @@ from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import unittest +import os class TrtConvertGatherNdTest_dim_4_1(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: + # The output has diff between gpu and trt in CI windows + # if ( and self.trt_param.precision == paddle_infer.PrecisionType.Half): + # return False return True def sample_program_configs(self): @@ -46,17 +50,19 @@ def generate_input2(): "op_attrs": {} }] ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input1)), - "index_data": TensorConfig(data_gen=partial(generate_input2)), - }, - outputs=["output_data"]) - - yield program_config + for i in range(10): + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig(data_gen=partial(generate_input1)), + "index_data": + TensorConfig(data_gen=partial(generate_input2)), + }, + outputs=["output_data"]) + + 
yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): @@ -71,7 +77,7 @@ def generate_dynamic_shape(attrs): "index_data": [1] } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 4, 64, 64], + "input_data": [2, 32, 64, 64], "index_data": [1] } @@ -94,11 +100,23 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 + + def add_skip_trt_case(self): + + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_SUPPORT, + "Under Windows CI, this case will sporadically fail.") def test(self): + self.add_skip_trt_case() self.run_test() @@ -145,14 +163,14 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 8], - "index_data": [1] + "index_data": [2] } self.dynamic_shape.max_input_shape = { "input_data": [4, 32, 64, 64], - "index_data": [4] + "index_data": [2] } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 4, 64, 64], + "input_data": [2, 32, 64, 64], "index_data": [2] } @@ -175,11 +193,23 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 + + def add_skip_trt_case(self): + + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_SUPPORT, + "Under Windows CI, this case will sporadically fail.") def test(self): + self.add_skip_trt_case() self.run_test() @@ -226,14 +256,14 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 8], - "index_data": [1, 2] + "index_data": [2, 2] } self.dynamic_shape.max_input_shape = { "input_data": [4, 32, 64, 64], - "index_data": [4, 4] + "index_data": [2, 2] } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 4, 64, 64], + "input_data": [2, 32, 64, 64], "index_data": [2, 2] } @@ -256,11 +286,23 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 + + def add_skip_trt_case(self): + + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_SUPPORT, + "Under Windows CI, this case will sporadically fail.") def test(self): + 
self.add_skip_trt_case() self.run_test() @@ -307,15 +349,15 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 8], - "index_data": [1, 2, 2] + "index_data": [2, 2, 4] } self.dynamic_shape.max_input_shape = { "input_data": [4, 32, 64, 64], - "index_data": [4, 4, 4] + "index_data": [2, 2, 4] } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 4, 64, 64], - "index_data": [2, 2, 2] + "input_data": [2, 32, 64, 64], + "index_data": [2, 2, 4] } def clear_dynamic_shape(): @@ -337,11 +379,23 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 + + def add_skip_trt_case(self): + + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_SUPPORT, + "Under Windows CI, this case will sporadically fail.") def test(self): + self.add_skip_trt_case() self.run_test() @@ -388,11 +442,11 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 4], - "index_data": [1, 1] + "index_data": [2, 2] } self.dynamic_shape.max_input_shape = { "input_data": [4, 64], - "index_data": [4, 2] + "index_data": [2, 2] } self.dynamic_shape.opt_input_shape = { "input_data": [2, 8], @@ -418,11 +472,23 @@ def clear_dynamic_shape(): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 + + def add_skip_trt_case(self): + + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_SUPPORT, + "Under Windows CI, this case will sporadically fail.") def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py index cebede99e6f82..fec4476939125 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py @@ -107,24 +107,30 @@ def generate_dynamic_shape(attrs): if attrs[0]['iou_aware'] == True: channel = 3 * (attrs[0]['class_num'] + 6) self.dynamic_shape.min_input_shape = { - 
"scale_input": [1, channel, 12, 12] + "yolo_box_input": [1, channel, 12, 12], + "imgsize": [1, 2] } self.dynamic_shape.max_input_shape = { - "scale_input": [4, channel, 24, 24] + "yolo_box_input": [4, channel, 24, 24], + "imgsize": [4, 2] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, channel, 24, 24] + "yolo_box_input": [1, channel, 24, 24], + "imgsize": [1, 2] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py index f62c5b47d5ab0..2bd374fe6d0e7 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_tril_triu_op_npu.py @@ -135,7 +135,7 @@ def initTestCase(self): class TestTrilTriuOpAPI(unittest.TestCase): - """ test case by using API and has -1 dimension + """ test case by using API and has -1 dimension """ def test_api(self): diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py deleted file mode 100644 index f4a02679b3220..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import unittest -import paddle -import paddle.fluid as fluid -import paddle.nn as nn -import paddle.distributed as dist -from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context -from paddle.distributed.auto_parallel.process_mesh import ProcessMesh - -paddle.enable_static() - -process_mesh1 = [0, 1, 2, 3] -process_mesh2 = [[0, 1, 2], [3, 4, 5]] - - -class SimpleNet(nn.Layer): - - def __init__(self, vocab_size=128, hidden_size=4): - super(SimpleNet, self).__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size) - self.dense1 = nn.Linear(hidden_size, hidden_size) - self.dense2 = nn.Linear(hidden_size, hidden_size // 2) - - def forward(self, x, y): - # Test shard_tensor interface with dist_attr arg - x = dist.shard_tensor(x, - dist_attr={ - "process_mesh": process_mesh1, - "dims_mapping": [0, -1] - }) - emb_out = self.word_embeddings(x) - # Test shard_tensor interface with no dist_attr arg - y = dist.shard_tensor(y) - linear1 = self.dense1(y) - out = self.dense2(linear1) - - return x, y - - -class TestAutoParallelAPI(unittest.TestCase): - - def test_api(self): - dist_context = get_default_distributed_context() - - net = SimpleNet() - data1 = fluid.layers.fill_constant(shape=[2, 4], value=1, dtype="int64") - data2 = fluid.layers.fill_constant(shape=[2, 4], - value=2, - dtype="float32") - data3 = fluid.layers.fill_constant(shape=[2, 4], - value=4, - dtype="float32") - - x, y = net.forward(data1, data2) - - dist_x = dist_context.get_dist_tensor_for_program(x) - self.assertEqual(dist_x.dist_attr.process_mesh.processes, process_mesh1) - self.assertEqual(dist_x.dist_attr.dims_mapping, [0, -1]) - self.assertEqual(dist_x.dist_attr.shard_sizes, None) - self.assertEqual(dist_x.dist_attr.device_placement, None) - self.assertTrue(dist_x.dist_attr.is_annotated("process_mesh")) - self.assertTrue(dist_x.dist_attr.is_annotated("dims_mapping")) - self.assertFalse(dist_x.dist_attr.is_annotated("shard_sizes")) - self.assertFalse(dist_x.dist_attr.is_annotated("device_placement")) - - dist_y = dist_context.get_dist_tensor_for_program(y) - self.assertEqual(dist_y.dist_attr.process_mesh, None) - self.assertEqual(dist_y.dist_attr.dims_mapping, [-1, -1]) - self.assertEqual(dist_y.dist_attr.shard_sizes, None) - self.assertEqual(dist_y.dist_attr.device_placement, None) - self.assertFalse(dist_y.dist_attr.is_annotated("process_mesh")) - self.assertFalse(dist_y.dist_attr.is_annotated("dims_mapping")) - self.assertFalse(dist_y.dist_attr.is_annotated("shard_sizes")) - self.assertFalse(dist_y.dist_attr.is_annotated("device_placement")) - - # Test shard_op interface with dist_attr - dims_mapping1 = [0, 1] - dims_mapping2 = [-1, 0] - dist_add = dist.shard_op(paddle.add, - dist_attr={ - data2: { - "process_mesh": process_mesh2, - "dims_mapping": dims_mapping1 - }, - data3: { - "dims_mapping": dims_mapping2 - } - }) - results = dist_add(data2, data3) - ops = paddle.static.default_main_program().block(0).ops - last_op = ops[-1] - - dist_op = dist_context.get_dist_op_for_program(last_op) - self.assertEqual(dist_op.dist_attr.process_mesh, - ProcessMesh(process_mesh2)) - self.assertEqual(dist_op.dist_attr.impl_type, "default") - self.assertEqual(dist_op.dist_attr.impl_idx, 0) - self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) - - data2_dist_attr = dist_op.dist_attr.get_input_dist_attr(data2.name) - self.assertEqual(data2_dist_attr.process_mesh, - dist_op.dist_attr.process_mesh) - 
self.assertEqual(data2_dist_attr.dims_mapping, dims_mapping1) - self.assertEqual(data2_dist_attr.shard_sizes, None) - self.assertEqual(data2_dist_attr.device_placement, None) - self.assertTrue(data2_dist_attr.is_annotated("process_mesh")) - self.assertTrue(data2_dist_attr.is_annotated("dims_mapping")) - self.assertFalse(data2_dist_attr.is_annotated("shard_sizes")) - self.assertFalse(data2_dist_attr.is_annotated("device_placement")) - - data3_dist_attr = dist_op.dist_attr.get_input_dist_attr(data3.name) - self.assertEqual(data3_dist_attr.process_mesh, - dist_op.dist_attr.process_mesh) - self.assertEqual(data3_dist_attr.dims_mapping, dims_mapping2) - self.assertEqual(data3_dist_attr.shard_sizes, None) - self.assertEqual(data3_dist_attr.device_placement, None) - self.assertTrue(data3_dist_attr.is_annotated("process_mesh")) - self.assertTrue(data3_dist_attr.is_annotated("dims_mapping")) - self.assertFalse(data3_dist_attr.is_annotated("shard_sizes")) - self.assertFalse(data3_dist_attr.is_annotated("device_placement")) - - # Test shard_op interface with dist_attr - dist_add = dist.shard_op(paddle.add) - results = dist_add(data2, data3) - ops = paddle.static.default_main_program().block(0).ops - last_op = ops[-1] - dist_op = dist_context.get_dist_op_for_program(last_op) - self.assertEqual(dist_op.dist_attr.process_mesh, None) - self.assertEqual(dist_op.dist_attr.impl_type, "default") - self.assertEqual(dist_op.dist_attr.impl_idx, 0) - self.assertFalse(dist_op.dist_attr.is_annotated("process_mesh")) - - data2_dist_attr = dist_op.dist_attr.get_input_dist_attr(data2.name) - self.assertEqual(data2_dist_attr.process_mesh, - dist_op.dist_attr.process_mesh) - self.assertEqual(data2_dist_attr.dims_mapping, [-1, -1]) - self.assertEqual(data2_dist_attr.shard_sizes, None) - self.assertEqual(data2_dist_attr.device_placement, None) - self.assertFalse(data2_dist_attr.is_annotated("process_mesh")) - self.assertFalse(data2_dist_attr.is_annotated("dims_mapping")) - self.assertFalse(data2_dist_attr.is_annotated("shard_sizes")) - self.assertFalse(data2_dist_attr.is_annotated("device_placement")) - - data3_dist_attr = dist_op.dist_attr.get_input_dist_attr(data3.name) - self.assertEqual(data3_dist_attr.process_mesh, - dist_op.dist_attr.process_mesh) - self.assertEqual(data3_dist_attr.dims_mapping, [-1, -1]) - self.assertEqual(data3_dist_attr.shard_sizes, None) - self.assertEqual(data3_dist_attr.device_placement, None) - self.assertFalse(data3_dist_attr.is_annotated("process_mesh")) - self.assertFalse(data3_dist_attr.is_annotated("dims_mapping")) - self.assertFalse(data3_dist_attr.is_annotated("shard_sizes")) - self.assertFalse(data3_dist_attr.is_annotated("device_placement")) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py index 393d79557a927..e07cc5cef93ad 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py @@ -66,39 +66,13 @@ def __init__(self, self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") def forward(self, input): - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear1.weight, - dist_attr={ 
- "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) - elif _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh2, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) out = self.norm(input) out = self.linear0(out) @@ -119,18 +93,10 @@ def mlp_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len, hidden_size], dtype='float32') - if _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -146,7 +112,8 @@ def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) train_program = static.Program() start_program = static.Program() dist_context = DistributedContext() @@ -161,7 +128,8 @@ def test_mlp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) train_program = static.Program() start_program = static.Program() @@ -177,8 +145,9 @@ def test_mlp_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() start_program = static.Program() @@ -286,18 +255,10 @@ def __init__(self, bias_attr=bias_attr) def forward(self, input): - if _global_parallel_strategy == "dp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None, None]) q = self.q_proj(input) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) @@ -306,38 +267,16 @@ def forward(self, input): k = self.k_proj(input) v = self.v_proj(input) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - 
auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -369,18 +308,10 @@ def forward(self, input): # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) return out @@ -411,7 +342,8 @@ def test_attn_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) train_program = static.Program() start_program = static.Program() dist_context = DistributedContext() @@ -420,15 +352,14 @@ def test_attn_dp(self): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) - # print_program_with_dist_attr(complete_train_program, - # dist_context) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_attn_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) train_program = static.Program() start_program = static.Program() @@ -444,8 +375,9 @@ def test_attn_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() start_program = static.Program() @@ -542,34 +474,18 @@ def __init__(self, self.dropout3 = nn.Dropout(self.dropout_ratio, mode="upscale_in_train") def forward(self, input_ids, position_ids): - if _global_parallel_strategy == "dp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None]) input_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) - 
if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) embeddings = input_embeddings + position_embeddings embeddings = self.dropout1(embeddings) @@ -585,38 +501,16 @@ def forward(self, input_ids, position_ids): k = self.k_proj(target) v = self.v_proj(target) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -649,18 +543,10 @@ def forward(self, input_ids, position_ids): # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # Add residual residual = embeddings + self.dropout2(out) @@ -673,28 +559,13 @@ def forward(self, input_ids, position_ids): out2 = F.gelu(out1, approximate=True) out3 = self.linear1(out2) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # Add residual final = residual + self.dropout3(out3) @@ -732,7 +603,8 @@ def test_decoder_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global 
_global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) train_program = static.Program() start_program = static.Program() dist_context = DistributedContext() @@ -747,7 +619,8 @@ def test_decoder_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) train_program = static.Program() start_program = static.Program() @@ -763,8 +636,9 @@ def test_decoder_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() start_program = static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py index ab110c929f5c5..088b7b636c418 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py @@ -116,18 +116,10 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ q = self.q_proj(query) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) @@ -158,34 +150,15 @@ def compute_kv(self, key, value): to construct cache for inference. 
""" k = self.k_proj(key) - - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - v = self.v_proj(value) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: + auto.shard_tensor(self.k_proj.weight, + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -265,18 +238,10 @@ def forward(self, # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) outs = [out] if self.need_weights: @@ -439,31 +404,13 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): if self.normalize_before: tgt = self.norm2(tgt) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - - if _global_parallel_strategy == "mp": + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # tgt = self.dropout2( # self.linear2(F.gelu( @@ -523,18 +470,10 @@ def forward(self, input_ids, position_ids=None): input_embedings = self.word_embeddings(input_ids) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings @@ -757,18 +696,10 @@ def gpt_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len], dtype='float64') - if _global_parallel_strategy == "dp": - 
auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None]) gpt = GPTModel(vocab_size=32768, hidden_size=1024, @@ -801,7 +732,8 @@ def test_gpt_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) train_program = static.Program() start_program = static.Program() @@ -817,7 +749,8 @@ def test_gpt_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) train_program = static.Program() start_program = static.Program() @@ -833,8 +766,9 @@ def test_gpt_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() start_program = static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index bb8642d569e42..7b48b921d5cec 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -35,8 +35,8 @@ paddle.enable_static() _global_parallel_strategy = "dp_mp_pp" -PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]]) -PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]]) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], dim_names=["x", "y"]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], dim_names=["x", "y"]) NUM_RANKS = 8 STAGE_0_CNT = 5 STAGE_1_CNT = 10 @@ -73,16 +73,8 @@ def __init__(self, def forward(self, input): if self.is_distributed: - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["y", None]) out = self.norm(input) out = self.linear0(out) @@ -135,16 +127,8 @@ def mlp_forward(train_program, start_program, is_distributed=True): dtype='float32') if is_distributed: - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, PP_MESH_0, ["x", None]) + auto.shard_tensor(label, PP_MESH_1, ["x", None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py index ca69535049c3b..63586c234b355 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py +++ 
b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py @@ -71,7 +71,7 @@ class TestDistributedTensor(unittest.TestCase): def test_new_local_tensor(self): test_auto_parallel_reshard._global_process_mesh = auto.ProcessMesh( - mesh=[0, 1]) + mesh=[0, 1], dim_names=["x"]) test_auto_parallel_reshard._global_parallel_strategy = "dp" train_program = paddle.static.Program() startup_program = paddle.static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 97855c8a8f156..7cc6b64894ebc 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -414,37 +414,25 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": [1, -1] - }) - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh[1], - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear3.weight, - dist_attr={ - "process_mesh": _global_process_mesh[1], - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh[0], + [None, "y"]) + + auto.shard_tensor(self.linear1.weight, _global_process_mesh[0], + ["y", None]) + + auto.shard_tensor(self.linear2.weight, _global_process_mesh[1], + [None, "y"]) + + auto.shard_tensor(self.linear3.weight, _global_process_mesh[1], + ["y", None]) out = self.norm(input) out = self.linear0(out) out = F.gelu(out, approximate=True) out = self.linear1(out) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _global_process_mesh[1], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(out, _global_process_mesh[1], ["x", None]) + out = self.linear2(out) out = F.gelu(out, approximate=True) out = self.linear3(out) @@ -464,11 +452,7 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh[0], ["x", None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, initializer_range=0.02) @@ -548,7 +532,10 @@ def test_mapper_dp_mp_pp(self): global _global_num_stages _global_num_stages = 2 global _global_process_mesh - _global_process_mesh = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]] + _global_process_mesh = [ + auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + auto.ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"]) + ] processes = [0, 1, 2, 3, 4, 5, 6, 7] dist_programs = {} diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py index 80135b6288531..af0f48e067649 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -276,39 +276,20 @@ def __init__(self, self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") def forward(self, input): - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, 
- "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) else: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, None]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, None]) out = self.norm(input) out = self.linear0(out) @@ -329,18 +310,10 @@ def mlp_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len, hidden_size], dtype='float32') - if _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -356,7 +329,8 @@ def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( mlp_pretrain_forward) @@ -391,7 +365,8 @@ def test_mlp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( mlp_pretrain_forward) @@ -453,8 +428,9 @@ def test_mlp_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( mlp_pretrain_forward) @@ -558,18 +534,10 @@ def __init__(self, bias_attr=bias_attr) def forward(self, input): - if _global_parallel_strategy == "dp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None, None]) q = self.q_proj(input) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) @@ -578,38 
+546,16 @@ def forward(self, input): k = self.k_proj(input) v = self.v_proj(input) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -641,18 +587,11 @@ def forward(self, input): # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) return out @@ -683,7 +622,8 @@ def test_attn_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( attn_pretrain_forward) @@ -717,7 +657,8 @@ def test_attn_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( attn_pretrain_forward) @@ -783,8 +724,9 @@ def test_attn_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( attn_pretrain_forward) @@ -930,34 +872,18 @@ def __init__(self, self.dropout3 = nn.Dropout(self.dropout_ratio, mode="upscale_in_train") def forward(self, input_ids, position_ids): - if _global_parallel_strategy == "dp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(input_ids, - dist_attr={ - 
"process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None]) input_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) embeddings = input_embeddings + position_embeddings embeddings = self.dropout1(embeddings) @@ -973,38 +899,16 @@ def forward(self, input_ids, position_ids): k = self.k_proj(target) v = self.v_proj(target) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -1037,24 +941,14 @@ def forward(self, input_ids, position_ids): # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) else: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, None]) # Add residual residual = embeddings + self.dropout2(out) @@ -1067,28 +961,13 @@ def forward(self, input_ids, position_ids): out2 = F.gelu(out1, approximate=True) out3 = self.linear1(out2) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 
1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # Add residual final = residual + self.dropout3(out3) @@ -1126,8 +1005,9 @@ def test_decoder_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( decoder_pretrain_forward) @@ -1208,8 +1088,9 @@ def test_decoder_noparallel(self): global _global_parallel_strategy _global_parallel_strategy = "None" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["x", "y"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( decoder_pretrain_forward) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 00ba2151fcba5..b01959af2986e 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -163,18 +163,10 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ q = self.q_proj(query) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) @@ -205,34 +197,15 @@ def compute_kv(self, key, value): to construct cache for inference. 
""" k = self.k_proj(key) - - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - v = self.v_proj(value) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: + auto.shard_tensor(self.k_proj.weight, + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -312,18 +285,10 @@ def forward(self, # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) outs = [out] if self.need_weights: @@ -486,31 +451,13 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): if self.normalize_before: tgt = self.norm2(tgt) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - - if _global_parallel_strategy == "mp": + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # tgt = self.dropout2( # self.linear2(F.gelu( @@ -570,18 +517,10 @@ def forward(self, input_ids, position_ids=None): input_embedings = self.word_embeddings(input_ids) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings @@ -804,18 +743,10 @@ def gpt_pretrain_forward(train_program, startup_program): shape=[batch_size, sequence_len], dtype='float64') - if _global_parallel_strategy == "dp": - 
auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None]) gpt = GPTModel(vocab_size=32768, hidden_size=768, @@ -863,8 +794,9 @@ def test_gpt_dp_mp(self): _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() startup_program = static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index 51926286acc15..140ed2dae61eb 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -63,27 +63,13 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) else: - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, None]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, None]) out = self.norm(input) out = self.linear0(out) @@ -107,28 +93,12 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None, None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh, ["x", None]) else: - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -296,11 +266,11 @@ def test_mlp_pp(self): global _global_parallel_strategy _global_parallel_strategy = "pp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) global PP_MESH_0 - PP_MESH_0 = auto.ProcessMesh(mesh=[0]) + PP_MESH_0 = auto.ProcessMesh(mesh=[0], dim_names=["x"]) global PP_MESH_1 - PP_MESH_1 = auto.ProcessMesh(mesh=[1]) + PP_MESH_1 = auto.ProcessMesh(mesh=[1], dim_names=["x"]) train_program = paddle.static.Program() startup_program = paddle.static.Program() @@ 
-325,11 +295,11 @@ def test_mlp_pp_diff_process_mesh(self): global _global_parallel_strategy _global_parallel_strategy = "pp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) global PP_MESH_0 - PP_MESH_0 = auto.ProcessMesh(mesh=[0]) + PP_MESH_0 = auto.ProcessMesh(mesh=[0], dim_names=["x"]) global PP_MESH_1 - PP_MESH_1 = auto.ProcessMesh(mesh=[1]) + PP_MESH_1 = auto.ProcessMesh(mesh=[1], dim_names=["x"]) train_program = paddle.static.Program() startup_program = paddle.static.Program() @@ -352,7 +322,7 @@ def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) train_program = paddle.static.Program() startup_program = paddle.static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index 33396f283ec0e..f77e0db3450e2 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -34,9 +34,10 @@ paddle.enable_static() _global_parallel_strategy = "dp_mp_pp" -_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) -PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]]) -PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]]) +_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]], + dim_names=["x", "y", "z"]) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], dim_names=["x", "y"]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], dim_names=["x", "y"]) class MLPLayer(nn.Layer): @@ -63,16 +64,8 @@ def __init__(self, self.norm = nn.LayerNorm(d_model, epsilon=1e-5) def forward(self, input): - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, "y"]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["y", None]) out = self.norm(input) out = self.linear0(out) @@ -80,11 +73,7 @@ def forward(self, input): out = self.linear1(out) param = paddle.fluid.layers.create_parameter([1024, 4096], paddle.float32) - auto.shard_tensor(param, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(param, PP_MESH_1, [None, "y"]) out = paddle.fluid.layers.mul(out, param) return out @@ -103,16 +92,8 @@ def mlp_forward(train_program, start_program): shape=[batch_size, 1], dtype='float32') - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, PP_MESH_0, ["x", None]) + auto.shard_tensor(label, PP_MESH_1, ["x", None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index d5de1c1287331..c9dbc77da8a78 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -34,9 +34,9 
@@ paddle.enable_static() _global_parallel_strategy = "mp_pp" -_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]]) -PP_MESH_0 = auto.ProcessMesh([0, 1]) -PP_MESH_1 = auto.ProcessMesh([2, 3]) +_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) +PP_MESH_0 = auto.ProcessMesh([0, 1], dim_names=["x"]) +PP_MESH_1 = auto.ProcessMesh([2, 3], dim_names=["x"]) class MLPLayer(nn.Layer): @@ -73,35 +73,15 @@ def __init__(self, bias_attr=bias_attr) def forward(self, input): - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, PP_MESH_0, ["x", None]) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, "x"]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["x", None]) + auto.shard_tensor(self.linear2.weight, PP_MESH_1, ["x", None]) w_out = self.word_embeddings(input) out = self.linear0(w_out) param = paddle.fluid.layers.create_parameter([4096, 4096], paddle.float32) - auto.shard_tensor(param, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(param, PP_MESH_0, ["x", None]) out = paddle.fluid.layers.mul(out, param) gelu_out = F.gelu(out, approximate=True) out = self.linear1(gelu_out) @@ -122,16 +102,8 @@ def mlp_forward(train_program, start_program): shape=[batch_size, 1], dtype='float32') - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -238,7 +210,6 @@ def test_mlp_mppp(self): resharder = Resharder(dist_main_prog, dist_startup_prog, rank_id, dist_context, dist_params_grads) resharder.reshard() - print_program_with_dist_attr(dist_main_prog, dist_context) # check send and recv result self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) @@ -249,32 +220,15 @@ def test_mlp_mppp(self): def test_allgather(self): train_program = paddle.static.Program() startup_program = paddle.static.Program() - process_mesh = auto.ProcessMesh(mesh=[0, 1]) + process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) with static.program_guard(train_program, startup_program): x = paddle.static.data(name="x", shape=[4, 4], dtype='float32') - x = auto.shard_tensor(x, - dist_attr={ - "process_mesh": process_mesh, - "dims_mapping": [0, -1] - }) - + x = auto.shard_tensor(x, process_mesh, ["x", None]) w = paddle.static.data(name="w", shape=[4, 4], dtype='float32') - w = auto.shard_tensor(w, - dist_attr={ - "process_mesh": process_mesh, - "dims_mapping": [-1, -1] - }) - - y = paddle.distributed.shard_op(paddle.matmul, - dist_attr={ - "process_mesh": process_mesh, - x: { - "dims_mapping": [-1, -1] - }, - w: { - "dims_mapping": [-1, -1] - } - })(x, w) + w = auto.shard_tensor(w, process_mesh, [None, None]) + + y = paddle.distributed.shard_op(paddle.matmul, process_mesh, + [[None, None], [None, None]])(x, w) rank_id = 0 dist_context = DistributedContext() diff 
--git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py index 64ff030f5b1e2..e255bcbcc0096 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -62,27 +62,13 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) else: - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, None]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, None]) out = self.norm(input) out = self.linear0(out) @@ -106,28 +92,12 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None, None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh, ["x", None]) else: - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -196,7 +166,7 @@ def test_mlp_serial(self): global _global_parallel_strategy _global_parallel_strategy = None global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0]) + _global_process_mesh = auto.ProcessMesh(mesh=[0], dim_names=["x"]) train_program = paddle.static.Program() startup_program = paddle.static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 5ab219df5ec8b..01b684b09abe7 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -245,7 +245,7 @@ def test_normal_initializer(self, dtype="float32"): name="param", initializer=initializer.NormalInitializer( 2.3, 1.9, 123)) - num_ops = 2 if (dtype == "float16" or dtype == "uint16") else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -390,7 +390,6 @@ def test_xavier_initializer_fp16(self): """Test the Xavier initializer with float16 """ block = self.test_xavier_initializer_supplied_arguments("float16") - self.assertTrue(check_cast_op(block.ops[1])) def test_xavier_initializer_bf16(self): """Test the Xavier initializer with bfloat16 @@ -400,7 +399,6 @@ def test_xavier_initializer_bf16(self): 
self.assertEqual(len(block_uniform.ops), 1) block_gaussian = self.test_xavier_initializer_supplied_arguments( "uint16", False) - self.assertTrue(check_cast_op(block_gaussian.ops[1])) class TestMSRAInitializer(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py index 0f4a2e7a67c6b..9e02eb3e7701b 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer_nn.py +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -398,7 +398,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.Normal(2.3, 1.9)) - num_ops = 2 if dtype in ["float16", "uint16"] else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 507083755c061..da89cbf33c3ff 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -1740,6 +1740,40 @@ def test_jit_save_incompatible_input_sepc(self): shutil.rmtree(save_dir) +class NotJitForward(paddle.nn.Layer): + + def __init__(self): + super(NotJitForward, self).__init__() + + def forward(self, x, y): + return x + y + + +class TestNotJitForward(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + + def test_jit_not_save_forward(self): + layer = NotJitForward() + + save_dir = os.path.join(self.temp_dir.name, "jit_not_save_forward") + path = save_dir + "/model" + + paddle.jit.save(layer=layer, path=path, skip_forward=True) + + self.assertTrue(not os.path.exists(path + ".pdmodel")) + self.assertTrue(not os.path.exists(path + ".pdparam")) + + with self.assertRaises(ValueError): + paddle.jit.load(path=path) + + shutil.rmtree(save_dir) + + if __name__ == '__main__': with fluid.framework._test_eager_guard(): unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_saved_tensors_hooks.py b/python/paddle/fluid/tests/unittests/test_saved_tensors_hooks.py new file mode 100644 index 0000000000000..7d2791d305899 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_saved_tensors_hooks.py @@ -0,0 +1,92 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
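A note on the `TestNotJitForward` case added above: `skip_forward=True` saves a `Layer` without translating or serializing its `forward`, so no `.pdmodel` is produced and a later `paddle.jit.load` on the same prefix fails. A minimal usage sketch mirroring that test (layer and paths are illustrative):

```python
import os
import tempfile

import paddle


class AddLayer(paddle.nn.Layer):
    """A layer whose forward should not be traced or saved."""

    def forward(self, x, y):
        return x + y


layer = AddLayer()
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "model")
    # With skip_forward=True no inference program is written ...
    paddle.jit.save(layer=layer, path=path, skip_forward=True)
    assert not os.path.exists(path + ".pdmodel")
    # ... and paddle.jit.load(path) is expected to raise ValueError,
    # as asserted in TestNotJitForward above.
```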
+ +import unittest +import numpy as np +import paddle +from paddle.autograd import PyLayer + + +class TestSavedTensorsHooks(unittest.TestCase): + + def test_save_for_multiply(self): + + def pack_hook(x): + return x.numpy() + + def unpack_hook(x): + return paddle.to_tensor(x) + + a = paddle.ones([3, 3]) + b = paddle.ones([3, 3]) * 2 + a.stop_gradient = False + b.stop_gradient = False + with paddle.autograd.saved_tensors_hooks(pack_hook, unpack_hook): + y = paddle.multiply(a, b) + y.sum().backward() + + aa = paddle.ones([3, 3]) + bb = paddle.ones([3, 3]) * 2 + aa.stop_gradient = False + bb.stop_gradient = False + yy = paddle.multiply(aa, bb) + yy.sum().backward() + + self.assertTrue(paddle.equal_all(aa.grad, a.grad)) + self.assertTrue(paddle.equal_all(bb.grad, b.grad)) + + def test_save_for_pylayer(self): + + class cus_multiply(PyLayer): + + @staticmethod + def forward(ctx, a, b): + y = paddle.multiply(a, b) + ctx.save_for_backward(a, b) + return y + + @staticmethod + def backward(ctx, dy): + a, b = ctx.saved_tensor() + grad_a = dy * a + grad_b = dy * b + return grad_a, grad_b + + def pack_hook(x): + return x.numpy() + + def unpack_hook(x): + return paddle.to_tensor(x) + + a = paddle.ones([3, 3]) + b = paddle.ones([3, 3]) * 2 + a.stop_gradient = False + b.stop_gradient = False + with paddle.autograd.saved_tensors_hooks(pack_hook, unpack_hook): + y = cus_multiply.apply(a, b) + y.sum().backward() + + aa = paddle.ones([3, 3]) + bb = paddle.ones([3, 3]) * 2 + aa.stop_gradient = False + bb.stop_gradient = False + yy = cus_multiply.apply(aa, bb) + yy.sum().backward() + + self.assertTrue(paddle.equal_all(aa.grad, a.grad)) + self.assertTrue(paddle.equal_all(bb.grad, b.grad)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index fee36c0d42151..bc10e4f4fdeb4 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -82,6 +82,33 @@ def config(self): self.out = self.input[-3:3, 0:100, :, 2:-1] +class TestSliceZerosShapeTensor(OpTest): + + def setUp(self): + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'use_mkldnn': True + } + + def config(self): + self.input = np.random.random([0, 0, 0]).astype("float32") + self.starts = [1] + self.ends = [2] + self.axes = [0] + self.infer_flags = [] + self.out = self.input[1:2] + + def test_check_output(self): + self.check_output_with_place(paddle.CPUPlace()) + + # 1.2 with attr(decrease) class TestSliceOp_decs_dim(OpTest): diff --git a/python/paddle/incubate/autograd/primops.py b/python/paddle/incubate/autograd/primops.py index dde3fb492cd97..636dc89220490 100644 --- a/python/paddle/incubate/autograd/primops.py +++ b/python/paddle/incubate/autograd/primops.py @@ -382,3 +382,15 @@ def max(x, y, out=None): @REGISTER_FN('erf_p', 'X', 'Y') def erf(x, out=None): return _simple_unop(LayerHelper('erf_p', **locals())) + + +@REGISTER_FN('cast_p', 'X', 'Y') +def cast(x, dtype, out=None): + helper = LayerHelper('cast_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type=helper.layer_type, + inputs={'X': x}, + outputs={'Y': out}, + attrs={'dtype': dtype}) + return out diff --git a/python/paddle/incubate/autograd/primreg.py 
b/python/paddle/incubate/autograd/primreg.py index 4721500b2be09..7972409d9356c 100644 --- a/python/paddle/incubate/autograd/primreg.py +++ b/python/paddle/incubate/autograd/primreg.py @@ -80,7 +80,7 @@ def div(x, y, out=None): """ args = _primop_position_argnames.lookup(op.type) - assert args is not None, 'args should not be None in op_position_inputs().' + assert args is not None, f'args of {op.type} should not be None in op_position_inputs().' *input_names, _ = args inputs = [] diff --git a/python/paddle/incubate/autograd/primrules.py b/python/paddle/incubate/autograd/primrules.py index 954bdf0cb1c86..4625cfd362f07 100644 --- a/python/paddle/incubate/autograd/primrules.py +++ b/python/paddle/incubate/autograd/primrules.py @@ -158,6 +158,13 @@ def elementwise_mul_orig2prim(op, x, y): return z +@REGISTER_ORIG2PRIM('elementwise_div') +def elementwise_div_orig2prim(op, x, y): + if x.shape != y.shape: + y = broadcast(y, shape=x.shape) + return primops.div(x, y) + + @REGISTER_ORIG2PRIM('tanh') def tanh_orig2prim(op, x): return tanh(x) @@ -322,6 +329,11 @@ def num_el(shape): raise RuntimeError('Only support lower l2/l1 norm currently') +@REGISTER_ORIG2PRIM('cast') +def cast_orig2prim(op, x): + return primops.cast(x, paddle.dtype(op.attr('out_dtype'))) + + # TODO: support broadcast @REGISTER_ORIG2PRIM('where') def select_orig2prim(op, condition, x, y): @@ -356,15 +368,27 @@ def ge_orig2prim(op, x, y): return ge(x, y) +# The paddle.pow API uses the "elementwise_pow" operator when y is a Tensor. @REGISTER_ORIG2PRIM('elementwise_pow') def elementwise_pow_orig2prim(op, x, y): if x.shape != y.shape: y = broadcast(y, shape=x.shape) - z = primops.pow(x, y) return z +# The paddle.pow API uses the "pow" operator when y is a scalar. +@REGISTER_ORIG2PRIM('pow') +def pow_orig2prim(op, x, y): + # x is the factorTensor defined in the paddle phi op. Currently it is None.
+ return primops.pow(y, fill_const(op.attr('factor'), y.shape, y.dtype)) + + +@REGISTER_ORIG2PRIM('square') +def square_orig2prim(op, x): + return primops.pow(x, fill_const(2., x.shape, x.dtype)) + + @REGISTER_ORIG2PRIM('elementwise_max') def elementwise_max_orig2prim(op, x, y): if x.shape != y.shape: @@ -415,6 +439,12 @@ def reduce_mean_orig2prim(op, x): return div(sum, norm) +@REGISTER_ORIG2PRIM('size') +def size_orig2prim(op, x): + return fill_const(functools.reduce(operator.mul, x.shape), (1, ), + paddle.int64) + + ## Register prim2orig lower rules @REGISTER_PRIM2ORIG('add_p') def add_prim2orig(op, x, y): @@ -592,6 +622,11 @@ def max_prim2orig(op, x, y): return paddle.maximum(x, y) +@REGISTER_PRIM2ORIG('cast_p') +def cast_prim2orig(op, x): + return paddle.cast(x, paddle.dtype(op.attr('dtype'))) + + ## Register linearize rules @REGISTER_JVP('add_p') def add_jvp(op, x_dot, y_dot): @@ -928,6 +963,12 @@ def max_jvp(op, x_dot, y_dot): return select(eq(y, z), y_dot, x_dot) +@REGISTER_JVP('cast_p') +def cast_jvp(op, x_dot): + y = op_position_output(op) + return primops.cast(x_dot, y.dtype) + + ## Register transpose rules @@ -1132,3 +1173,9 @@ def select_transpose(op, check_dot, z_bar): y_bar = select(cond, zeros_y, z_bar) if check_dot(y) else None return cond_bar, x_bar, y_bar + + +@REGISTER_TRANSPOSE('cast_p') +def cast_transpose(op, check_dot, y_bar): + x, = op_position_inputs(op) + return primops.cast(y_bar, x.dtype) diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 220246ef61551..1d87e4857cd0d 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -440,7 +440,7 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, # NOTE(Aurelius84): Since Paddle 2.0, we require gcc version > 5.x, # so we add this flag to ensure the symbol names from user compiled - # shared library have same ABI suffix with core_(no)avx.so. + # shared library have same ABI suffix with libpaddle.so. # See https://stackoverflow.com/questions/34571583/understanding-gcc-5s-glibcxx-use-cxx11-abi-or-the-new-abi add_compile_flag(cflags, ['-D_GLIBCXX_USE_CXX11_ABI=1']) # Append this macor only when jointly compiling .cc with .cu diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 6e6047ccda02e..4d9b14fad151b 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -399,10 +399,7 @@ def _get_core_name(): """ import paddle ext_name = '.pyd' if IS_WINDOWS else '.so' - if not paddle.fluid.core.load_noavx: - return 'core_avx' + ext_name - else: - return 'core_noavx' + ext_name + return 'libpaddle' + ext_name def _get_lib_core_path(): @@ -419,13 +416,13 @@ def _get_dll_core_path(): Return real path of libcore_(no)avx.dylib on Windows. """ raw_core_name = _get_core_name() - dll_core_name = "paddle_pybind.dll" + dll_core_name = "libpaddle.dll" return os.path.join(_get_fluid_path(), dll_core_name) def _reset_so_rpath(so_path): """ - NOTE(Aurelius84): Runtime path of core_(no)avx.so is modified into `@loader_path/../libs` + NOTE(Aurelius84): Runtime path of libpaddle.so is modified into `@loader_path/../libs` in setup.py.in. While loading custom op, `@loader_path` is the dirname of custom op instead of `paddle/fluid`. So we modify `@loader_path` from custom dylib into `@rpath` to ensure dynamic loader find it correctly. 
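The `cast_p` rules added to primrules.py above complete the primitive's wiring: `cast_orig2prim`/`cast_prim2orig` translate between the original `cast` op and the primitive, while the JVP and transpose rules are casts themselves, since casting is a linear map whose tangent pushes forward (and whose cotangent pulls back) by the same dtype conversion. A hedged eager-mode sanity check of that pullback behavior, independent of the prim machinery:

```python
import paddle

# Cast is linear: its JVP casts the tangent to the output dtype and its
# transpose casts the cotangent back to the input dtype, which is exactly
# what cast_jvp and cast_transpose implement above.
x = paddle.to_tensor([1.5, -2.0, 3.25], dtype="float32")
x.stop_gradient = False
y = paddle.cast(x, "float64")   # forward: float32 -> float64
y.sum().backward()              # pullback: float64 -> float32
assert x.grad.dtype == paddle.float32  # cotangent returns in x's dtype
assert bool((x.grad == 1.0).all())     # d(sum(cast(x)))/dx is all ones
```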
@@ -524,7 +521,7 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): # See _reset_so_rpath for details. extra_link_args.append('-Wl,-rpath,{}'.format(_get_fluid_path())) # On MacOS, ld don't support `-l:xx`, so we create a - # libcore_avx.dylib symbol link. + # liblibpaddle.dylib symbol link. lib_core_name = create_sym_link_if_not_exist() extra_link_args.append('-l{}'.format(lib_core_name)) ########################### -- END -- ########################### @@ -555,7 +552,7 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): def create_sym_link_if_not_exist(): """ - Create soft symbol link of `core_avx.so` or `core_noavx.so` + Create soft symbol link of `libpaddle.so` """ assert OS_NAME.startswith('darwin') or IS_WINDOWS @@ -574,7 +571,7 @@ def create_sym_link_if_not_exist(): .format(raw_core_name, new_dll_core_path, core_path, raw_core_name)) run_cmd('mklink /H {} {}'.format(new_dll_core_path, core_path)) - # core_avx or core_noavx with lib suffix + # libpaddle with lib suffix assert os.path.exists(new_dll_core_path) return raw_core_name[:-4] + ".lib" @@ -590,7 +587,7 @@ def create_sym_link_if_not_exist(): "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`" .format(raw_core_name, core_path, new_lib_core_path)) - # core_avx or core_noavx without suffix + # libpaddle without suffix return raw_core_name[:-3] @@ -779,7 +776,7 @@ def find_paddle_libraries(use_cuda=False): cuda_lib_dir = find_cuda_libraries() paddle_lib_dirs.extend(cuda_lib_dir) - # add `paddle/fluid` to search `core_avx.so` or `core_noavx.so` + # add `paddle/fluid` to search `libpaddle.so` paddle_lib_dirs.append(_get_fluid_path()) return paddle_lib_dirs diff --git a/python/setup.py.in b/python/setup.py.in index d9481baf045fa..b895fb98cf37f 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -308,6 +308,8 @@ packages=['paddle', 'paddle.distributed.fleet.metrics', 'paddle.distributed.fleet.proto', 'paddle.distributed.fleet.utils', + 'paddle.distributed.fleet.layers', + 'paddle.distributed.fleet.layers.mpu', 'paddle.distributed.fleet.meta_parallel', 'paddle.distributed.fleet.meta_parallel.pp_utils', 'paddle.distributed.fleet.meta_parallel.sharding', @@ -451,8 +453,6 @@ else: package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.pyd', '${FLUID_CORE_NAME}' + '.lib']} package_data['paddle.fluid'] += ['${PADDLE_BINARY_DIR}/python/paddle/cost_model/static_op_benchmark.json'] -if '${HAS_NOAVX_CORE}' == 'ON': - package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')] package_dir={ '': '${PADDLE_BINARY_DIR}/python', @@ -494,7 +494,7 @@ else: package_data['paddle.libs'] += ['openblas' + ext_name] elif os.name == 'posix' and platform.machine() == 'aarch64' and '${OPENBLAS_LIB}'.endswith('so'): # copy the libopenblas.so on linux+aarch64 - # special: core_noavx.so depends on 'libopenblas.so.0', not 'libopenblas.so' + # special: libpaddle.so without avx depends on 'libopenblas.so.0', not 'libopenblas.so' if os.path.exists('${OPENBLAS_LIB}' + '.0'): shutil.copy('${OPENBLAS_LIB}' + '.0', libs_path) package_data['paddle.libs'] += ['libopenblas.so.0'] @@ -589,8 +589,7 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') else: - commands = ["patchelf --set-soname '${FLUID_CORE_NAME}.so' 
${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] - commands.append("patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') + commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] # The sw_64 does not support patchelf, so we just disable that. if platform.machine() != 'sw_64' and platform.machine() != 'mips64': for command in commands: diff --git a/tools/print_signatures.py b/tools/print_signatures.py index f751709a767a5..5a6ad44e45d56 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -296,7 +296,7 @@ def parse_args(): dest='skipped', type=str, help='Skip Checking submodules', - default='paddle.fluid.core_avx.eager.ops') + default='paddle.fluid.libpaddle.eager.ops') if len(sys.argv) == 1: args = parser.parse_args(['paddle'])
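Stepping back, most of the test churn in this patch is one mechanical migration: `ProcessMesh` gains `dim_names`, and `auto.shard_tensor`/`paddle.distributed.shard_op` drop the `dist_attr` dict in favor of a `process_mesh` plus a `shard_spec` of mesh-dimension names, where `None` marks a replicated axis. A condensed before/after sketch (mesh and shapes are illustrative; the `auto` alias follows the tests above, and its exact import path may differ by branch):

```python
import paddle
import paddle.distributed.auto_parallel as auto  # alias as used in the tests

paddle.enable_static()

# dim_names replace positional dims_mapping indices: "dp" is axis 0, "mp" axis 1.
mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]],
                        dim_names=["dp", "mp"])

x = paddle.static.data(name="x", shape=[8, 1024], dtype="float32")
w = paddle.static.data(name="w", shape=[1024, 1024], dtype="float32")

# Old style: auto.shard_tensor(x, dist_attr={"process_mesh": mesh,
#                                            "dims_mapping": [0, -1]})
# New style: name the mesh axis each tensor axis is split along.
auto.shard_tensor(x, process_mesh=mesh, shard_spec=["dp", None])
auto.shard_tensor(w, process_mesh=mesh, shard_spec=[None, "mp"])

# shard_op follows suit: one shard_spec per operator input.
y = paddle.distributed.shard_op(paddle.matmul, mesh,
                                [[None, None], [None, None]])(x, w)
```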